From 262656b824549272f42b90ceef5e59fa356ce968 Mon Sep 17 00:00:00 2001 From: Tyler Ford <25458477+TF0rd@users.noreply.github.com> Date: Sat, 6 Jun 2026 19:38:22 -0400 Subject: [PATCH 1/3] add nvidia provider with 15 models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add model parameter catalog entries for all NVIDIA NIM API models (provider: nvidia, authType: api_key) with parameters sourced from the official NVIDIA API reference at docs.api.nvidia.com. Models added: - nemotron-3-ultra-550b-a55b (reasoning_effort, reasoning_budget) - nemotron-3-super-120b-a12b (reasoning_effort, reasoning_budget) - nemotron-3-nano-30b-a3b - nemotron-mini-4b-instruct (tools support) - nemotron-content-safety-reasoning-4b - llama-3.1-nemotron-nano-8b-v1 - llama-3.1-nemotron-ultra-253b-v1 - llama-3.3-nemotron-super-49b-v1 - llama-3.3-nemotron-super-49b-v1.5 (65536 max_tokens) - llama-3.1-nemotron-safety-guard-8b-v3 - llama-3.1-nemoguard-8b-topic-control - riva-translate-4b-instruct-v1.1 - usdcode-llama-3.1-70b-instruct (expert_type enum) - gliner-pii (entity extraction: labels, threshold, chunk_length, overlap, flat_ner) - nemoguard-jailbreak-detect Two models omitted — their infer pages document only stream (a reserved MPS path not eligible as a parameter): - llama-3.1-nemoguard-8b-content-safety - nvidia-nemotron-nano-9b-v2 (infer page appears buggy; serves content-safety model data) --- models/nvidia/gliner-pii.yaml | 43 +++++++++++++ .../llama-3.1-nemoguard-8b-topic-control.yaml | 53 ++++++++++++++++ .../nvidia/llama-3.1-nemotron-nano-8b-v1.yaml | 63 +++++++++++++++++++ ...llama-3.1-nemotron-safety-guard-8b-v3.yaml | 14 +++++ .../llama-3.1-nemotron-ultra-253b-v1.yaml | 63 +++++++++++++++++++ .../llama-3.3-nemotron-super-49b-v1.5.yaml | 63 +++++++++++++++++++ .../llama-3.3-nemotron-super-49b-v1.yaml | 63 +++++++++++++++++++ models/nvidia/nemoguard-jailbreak-detect.yaml | 10 +++ models/nvidia/nemotron-3-nano-30b-a3b.yaml | 43 +++++++++++++ models/nvidia/nemotron-3-super-120b-a12b.yaml | 62 ++++++++++++++++++ models/nvidia/nemotron-3-ultra-550b-a55b.yaml | 62 ++++++++++++++++++ .../nemotron-content-safety-reasoning-4b.yaml | 43 +++++++++++++ models/nvidia/nemotron-mini-4b-instruct.yaml | 59 +++++++++++++++++ .../riva-translate-4b-instruct-v1.1.yaml | 54 ++++++++++++++++ .../usdcode-llama-3.1-70b-instruct.yaml | 42 +++++++++++++ 15 files changed, 737 insertions(+) create mode 100644 models/nvidia/gliner-pii.yaml create mode 100644 models/nvidia/llama-3.1-nemoguard-8b-topic-control.yaml create mode 100644 models/nvidia/llama-3.1-nemotron-nano-8b-v1.yaml create mode 100644 models/nvidia/llama-3.1-nemotron-safety-guard-8b-v3.yaml create mode 100644 models/nvidia/llama-3.1-nemotron-ultra-253b-v1.yaml create mode 100644 models/nvidia/llama-3.3-nemotron-super-49b-v1.5.yaml create mode 100644 models/nvidia/llama-3.3-nemotron-super-49b-v1.yaml create mode 100644 models/nvidia/nemoguard-jailbreak-detect.yaml create mode 100644 models/nvidia/nemotron-3-nano-30b-a3b.yaml create mode 100644 models/nvidia/nemotron-3-super-120b-a12b.yaml create mode 100644 models/nvidia/nemotron-3-ultra-550b-a55b.yaml create mode 100644 models/nvidia/nemotron-content-safety-reasoning-4b.yaml create mode 100644 models/nvidia/nemotron-mini-4b-instruct.yaml create mode 100644 models/nvidia/riva-translate-4b-instruct-v1.1.yaml create mode 100644 models/nvidia/usdcode-llama-3.1-70b-instruct.yaml diff --git a/models/nvidia/gliner-pii.yaml b/models/nvidia/gliner-pii.yaml new file mode 100644 index 0000000..9bbb542 --- /dev/null +++ b/models/nvidia/gliner-pii.yaml @@ -0,0 +1,43 @@ +# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json +provider: nvidia +authType: api_key +model: gliner-pii +params: + - path: labels + type: string + label: Labels + description: Entity types to detect. If not specified, uses the default set of 55 PII categories including email, phone_number, ssn, first_name, last_name, and address. + group: provider_metadata + - path: threshold + type: number + label: Threshold + description: Confidence threshold for entity detection. Lower values detect more entities but may include false positives. + default: 0.5 + range: + min: 0 + max: 1 + group: sampling + - path: chunk_length + type: integer + label: Chunk length + description: Context window size for processing. Longer texts are automatically split into chunks with overlap for complete coverage. Must be greater than overlap. + default: 384 + range: + min: 1 + max: 2048 + group: provider_metadata + - path: overlap + type: integer + label: Overlap + description: Token overlap between chunks to prevent entity clipping. Must be less than chunk_length. + default: 128 + range: + min: 0 + max: 512 + group: provider_metadata + - path: flat_ner + type: boolean + label: Flat NER + description: When true, prevents overlapping entity spans. When false, may return nested entities such as both a full name and its constituent first name. + default: false + group: provider_metadata diff --git a/models/nvidia/llama-3.1-nemoguard-8b-topic-control.yaml b/models/nvidia/llama-3.1-nemoguard-8b-topic-control.yaml new file mode 100644 index 0000000..bfc0d61 --- /dev/null +++ b/models/nvidia/llama-3.1-nemoguard-8b-topic-control.yaml @@ -0,0 +1,53 @@ +# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json +provider: nvidia +authType: api_key +model: llama-3.1-nemoguard-8b-topic-control +params: + - path: temperature + type: number + label: Temperature + description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call. + default: 0.5 + range: + min: 0 + max: 2 + group: sampling + - path: top_p + type: number + label: Top P + description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call. + default: 1 + range: + max: 1 + group: sampling + - path: max_tokens + type: integer + label: Max tokens + description: Maximum number of tokens to generate. Generation stops when this limit is reached. + default: 1024 + range: + min: 1 + group: generation_length + - path: frequency_penalty + type: number + label: Frequency penalty + description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. + default: 0 + range: + min: -2 + max: 2 + group: sampling + - path: presence_penalty + type: number + label: Presence penalty + description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. + default: 0 + range: + min: -2 + max: 2 + group: sampling + - path: stop + type: string + label: Stop + description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence. + group: generation_length diff --git a/models/nvidia/llama-3.1-nemotron-nano-8b-v1.yaml b/models/nvidia/llama-3.1-nemotron-nano-8b-v1.yaml new file mode 100644 index 0000000..fc46239 --- /dev/null +++ b/models/nvidia/llama-3.1-nemotron-nano-8b-v1.yaml @@ -0,0 +1,63 @@ +# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json +provider: nvidia +authType: api_key +model: llama-3.1-nemotron-nano-8b-v1 +params: + - path: temperature + type: number + label: Temperature + description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call. + default: 0.6 + range: + min: 0 + max: 1 + group: sampling + - path: top_p + type: number + label: Top P + description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call. + default: 0.95 + range: + max: 1 + group: sampling + - path: max_tokens + type: integer + label: Max tokens + description: Maximum number of tokens to generate. Generation stops when this limit is reached. + default: 4096 + range: + min: 1 + max: 16384 + group: generation_length + - path: frequency_penalty + type: number + label: Frequency penalty + description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. + default: 0 + range: + min: -2 + max: 2 + group: sampling + - path: presence_penalty + type: number + label: Presence penalty + description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. + default: 0 + range: + min: -2 + max: 2 + group: sampling + - path: seed + type: integer + label: Seed + description: Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results. + default: 0 + range: + min: 0 + max: 18446744073709552000 + group: sampling + - path: stop + type: string + label: Stop + description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence. + group: generation_length diff --git a/models/nvidia/llama-3.1-nemotron-safety-guard-8b-v3.yaml b/models/nvidia/llama-3.1-nemotron-safety-guard-8b-v3.yaml new file mode 100644 index 0000000..4256093 --- /dev/null +++ b/models/nvidia/llama-3.1-nemotron-safety-guard-8b-v3.yaml @@ -0,0 +1,14 @@ +# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json +provider: nvidia +authType: api_key +model: llama-3.1-nemotron-safety-guard-8b-v3 +params: + - path: temperature + type: number + label: Temperature + description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. + default: 0 + range: + min: 0 + max: 1 + group: sampling diff --git a/models/nvidia/llama-3.1-nemotron-ultra-253b-v1.yaml b/models/nvidia/llama-3.1-nemotron-ultra-253b-v1.yaml new file mode 100644 index 0000000..367b0ef --- /dev/null +++ b/models/nvidia/llama-3.1-nemotron-ultra-253b-v1.yaml @@ -0,0 +1,63 @@ +# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json +provider: nvidia +authType: api_key +model: llama-3.1-nemotron-ultra-253b-v1 +params: + - path: temperature + type: number + label: Temperature + description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call. + default: 0.6 + range: + min: 0 + max: 1 + group: sampling + - path: top_p + type: number + label: Top P + description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call. + default: 0.95 + range: + max: 1 + group: sampling + - path: max_tokens + type: integer + label: Max tokens + description: Maximum number of tokens to generate. Generation stops when this limit is reached. + default: 4096 + range: + min: 1 + max: 16384 + group: generation_length + - path: frequency_penalty + type: number + label: Frequency penalty + description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. + default: 0 + range: + min: -2 + max: 2 + group: sampling + - path: presence_penalty + type: number + label: Presence penalty + description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. + default: 0 + range: + min: -2 + max: 2 + group: sampling + - path: seed + type: integer + label: Seed + description: Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results. + default: 0 + range: + min: 0 + max: 18446744073709552000 + group: sampling + - path: stop + type: string + label: Stop + description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence. + group: generation_length diff --git a/models/nvidia/llama-3.3-nemotron-super-49b-v1.5.yaml b/models/nvidia/llama-3.3-nemotron-super-49b-v1.5.yaml new file mode 100644 index 0000000..b20d19b --- /dev/null +++ b/models/nvidia/llama-3.3-nemotron-super-49b-v1.5.yaml @@ -0,0 +1,63 @@ +# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json +provider: nvidia +authType: api_key +model: llama-3.3-nemotron-super-49b-v1.5 +params: + - path: temperature + type: number + label: Temperature + description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call. + default: 0.6 + range: + min: 0 + max: 1 + group: sampling + - path: top_p + type: number + label: Top P + description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call. + default: 0.95 + range: + max: 1 + group: sampling + - path: max_tokens + type: integer + label: Max tokens + description: Maximum number of tokens to generate. Generation stops when this limit is reached. + default: 65536 + range: + min: 1 + max: 65536 + group: generation_length + - path: frequency_penalty + type: number + label: Frequency penalty + description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. + default: 0 + range: + min: -2 + max: 2 + group: sampling + - path: presence_penalty + type: number + label: Presence penalty + description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. + default: 0 + range: + min: -2 + max: 2 + group: sampling + - path: seed + type: integer + label: Seed + description: Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results. + default: 0 + range: + min: 0 + max: 18446744073709552000 + group: sampling + - path: stop + type: string + label: Stop + description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence. + group: generation_length diff --git a/models/nvidia/llama-3.3-nemotron-super-49b-v1.yaml b/models/nvidia/llama-3.3-nemotron-super-49b-v1.yaml new file mode 100644 index 0000000..313d97e --- /dev/null +++ b/models/nvidia/llama-3.3-nemotron-super-49b-v1.yaml @@ -0,0 +1,63 @@ +# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json +provider: nvidia +authType: api_key +model: llama-3.3-nemotron-super-49b-v1 +params: + - path: temperature + type: number + label: Temperature + description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call. + default: 0.6 + range: + min: 0 + max: 1 + group: sampling + - path: top_p + type: number + label: Top P + description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call. + default: 0.95 + range: + max: 1 + group: sampling + - path: max_tokens + type: integer + label: Max tokens + description: Maximum number of tokens to generate. Generation stops when this limit is reached. + default: 4096 + range: + min: 1 + max: 16384 + group: generation_length + - path: frequency_penalty + type: number + label: Frequency penalty + description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. + default: 0 + range: + min: -2 + max: 2 + group: sampling + - path: presence_penalty + type: number + label: Presence penalty + description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. + default: 0 + range: + min: -2 + max: 2 + group: sampling + - path: seed + type: integer + label: Seed + description: Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results. + default: 0 + range: + min: 0 + max: 18446744073709552000 + group: sampling + - path: stop + type: string + label: Stop + description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence. + group: generation_length diff --git a/models/nvidia/nemoguard-jailbreak-detect.yaml b/models/nvidia/nemoguard-jailbreak-detect.yaml new file mode 100644 index 0000000..1800f49 --- /dev/null +++ b/models/nvidia/nemoguard-jailbreak-detect.yaml @@ -0,0 +1,10 @@ +# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json +provider: nvidia +authType: api_key +model: nemoguard-jailbreak-detect +params: + - path: input + type: string + label: Input + description: The text to classify for jailbreak attempts. Accepts a string or an array of strings. + group: provider_metadata diff --git a/models/nvidia/nemotron-3-nano-30b-a3b.yaml b/models/nvidia/nemotron-3-nano-30b-a3b.yaml new file mode 100644 index 0000000..aaf9bcc --- /dev/null +++ b/models/nvidia/nemotron-3-nano-30b-a3b.yaml @@ -0,0 +1,43 @@ +# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json +provider: nvidia +authType: api_key +model: nemotron-3-nano-30b-a3b +params: + - path: temperature + type: number + label: Temperature + description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call. + default: 1 + range: + max: 1 + group: sampling + - path: top_p + type: number + label: Top P + description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call. + default: 1 + range: + max: 1 + group: sampling + - path: max_tokens + type: integer + label: Max tokens + description: Maximum number of tokens to generate. Generation stops when this limit is reached. + default: 16384 + range: + min: 1 + max: 32768 + group: generation_length + - path: seed + type: integer + label: Seed + description: Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result. + range: + min: 0 + max: 18446744073709552000 + group: sampling + - path: stop + type: string + label: Stop + description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence. + group: generation_length diff --git a/models/nvidia/nemotron-3-super-120b-a12b.yaml b/models/nvidia/nemotron-3-super-120b-a12b.yaml new file mode 100644 index 0000000..1a81de2 --- /dev/null +++ b/models/nvidia/nemotron-3-super-120b-a12b.yaml @@ -0,0 +1,62 @@ +# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json +provider: nvidia +authType: api_key +model: nemotron-3-super-120b-a12b +params: + - path: temperature + type: number + label: Temperature + description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call. + default: 1 + range: + max: 1 + group: sampling + - path: top_p + type: number + label: Top P + description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call. + default: 0.95 + range: + max: 1 + group: sampling + - path: max_tokens + type: integer + label: Max tokens + description: Maximum number of tokens to generate. Generation stops when this limit is reached. + default: 16384 + range: + min: 1 + max: 32768 + group: generation_length + - path: reasoning_effort + type: enum + label: Reasoning effort + description: Controls the reasoning mode. 'none' disables reasoning tokens, 'low' enables low-effort reasoning, and 'high' enables full reasoning. + default: high + values: + - none + - low + - high + group: reasoning + - path: reasoning_budget + type: integer + label: Reasoning budget + description: Maximum number of tokens the model may use for internal reasoning before being forced to end the reasoning trace. Use -1 to disable budget enforcement. + default: 16384 + range: + min: -1 + max: 32768 + group: reasoning + - path: seed + type: integer + label: Seed + description: Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result. + range: + min: 0 + max: 18446744073709552000 + group: sampling + - path: stop + type: string + label: Stop + description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence. + group: generation_length diff --git a/models/nvidia/nemotron-3-ultra-550b-a55b.yaml b/models/nvidia/nemotron-3-ultra-550b-a55b.yaml new file mode 100644 index 0000000..b7c2fef --- /dev/null +++ b/models/nvidia/nemotron-3-ultra-550b-a55b.yaml @@ -0,0 +1,62 @@ +# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json +provider: nvidia +authType: api_key +model: nemotron-3-ultra-550b-a55b +params: + - path: temperature + type: number + label: Temperature + description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call. + default: 1 + range: + max: 1 + group: sampling + - path: top_p + type: number + label: Top P + description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call. + default: 0.95 + range: + max: 1 + group: sampling + - path: max_tokens + type: integer + label: Max tokens + description: Maximum number of tokens to generate. Generation stops when this limit is reached. + default: 16384 + range: + min: 1 + max: 32768 + group: generation_length + - path: reasoning_effort + type: enum + label: Reasoning effort + description: Controls the reasoning mode. 'none' disables reasoning tokens, 'medium' enables efficient reasoning, and 'high' enables full reasoning. + default: high + values: + - none + - medium + - high + group: reasoning + - path: reasoning_budget + type: integer + label: Reasoning budget + description: Maximum number of tokens the model may use for internal reasoning before being forced to end the reasoning trace. Use -1 to disable budget enforcement. + default: 16384 + range: + min: -1 + max: 32768 + group: reasoning + - path: seed + type: integer + label: Seed + description: Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result. + range: + min: 0 + max: 18446744073709552000 + group: sampling + - path: stop + type: string + label: Stop + description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence. + group: generation_length diff --git a/models/nvidia/nemotron-content-safety-reasoning-4b.yaml b/models/nvidia/nemotron-content-safety-reasoning-4b.yaml new file mode 100644 index 0000000..7bfa8bb --- /dev/null +++ b/models/nvidia/nemotron-content-safety-reasoning-4b.yaml @@ -0,0 +1,43 @@ +# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json +provider: nvidia +authType: api_key +model: nemotron-content-safety-reasoning-4b +params: + - path: temperature + type: number + label: Temperature + description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call. + default: 1 + range: + max: 1 + group: sampling + - path: top_p + type: number + label: Top P + description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call. + default: 1 + range: + max: 1 + group: sampling + - path: max_tokens + type: integer + label: Max tokens + description: Maximum number of tokens to generate. Generation stops when this limit is reached. + default: 16384 + range: + min: 1 + max: 32768 + group: generation_length + - path: seed + type: integer + label: Seed + description: Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result. + range: + min: 0 + max: 18446744073709552000 + group: sampling + - path: stop + type: string + label: Stop + description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence. + group: generation_length diff --git a/models/nvidia/nemotron-mini-4b-instruct.yaml b/models/nvidia/nemotron-mini-4b-instruct.yaml new file mode 100644 index 0000000..070d6bf --- /dev/null +++ b/models/nvidia/nemotron-mini-4b-instruct.yaml @@ -0,0 +1,59 @@ +# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json +provider: nvidia +authType: api_key +model: nemotron-mini-4b-instruct +params: + - path: temperature + type: number + label: Temperature + description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call. + default: 0.2 + range: + min: 0 + max: 1 + group: sampling + - path: top_p + type: number + label: Top P + description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call. + default: 0.7 + range: + max: 1 + group: sampling + - path: max_tokens + type: integer + label: Max tokens + description: Maximum number of tokens to generate. Generation stops when this limit is reached. + default: 1024 + range: + min: 1 + max: 4096 + group: generation_length + - path: frequency_penalty + type: number + label: Frequency penalty + description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. + default: 0 + range: + min: -2 + max: 2 + group: sampling + - path: presence_penalty + type: number + label: Presence penalty + description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. + default: 0 + range: + min: -2 + max: 2 + group: sampling + - path: stop + type: string + label: Stop + description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence. + group: generation_length + - path: tools + type: string + label: Tools + description: A list of tools the model may call. If no tools are provided, the model will not call any tools. + group: tooling diff --git a/models/nvidia/riva-translate-4b-instruct-v1.1.yaml b/models/nvidia/riva-translate-4b-instruct-v1.1.yaml new file mode 100644 index 0000000..420c6e0 --- /dev/null +++ b/models/nvidia/riva-translate-4b-instruct-v1.1.yaml @@ -0,0 +1,54 @@ +# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json +provider: nvidia +authType: api_key +model: riva-translate-4b-instruct-v1.1 +params: + - path: temperature + type: number + label: Temperature + description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call. + default: 0 + range: + min: 0 + max: 1 + group: sampling + - path: top_p + type: number + label: Top P + description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call. + default: 0.9 + range: + max: 1 + group: sampling + - path: max_tokens + type: integer + label: Max tokens + description: Maximum number of tokens to generate. Generation stops when this limit is reached. + default: 512 + range: + min: 1 + max: 4096 + group: generation_length + - path: frequency_penalty + type: number + label: Frequency penalty + description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. + default: 0 + range: + min: -2 + max: 2 + group: sampling + - path: presence_penalty + type: number + label: Presence penalty + description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. + default: 0 + range: + min: -2 + max: 2 + group: sampling + - path: stop + type: string + label: Stop + description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence. + group: generation_length diff --git a/models/nvidia/usdcode-llama-3.1-70b-instruct.yaml b/models/nvidia/usdcode-llama-3.1-70b-instruct.yaml new file mode 100644 index 0000000..6f4aa2b --- /dev/null +++ b/models/nvidia/usdcode-llama-3.1-70b-instruct.yaml @@ -0,0 +1,42 @@ +# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json +provider: nvidia +authType: api_key +model: usdcode-llama-3.1-70b-instruct +params: + - path: temperature + type: number + label: Temperature + description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call. + default: 0.1 + range: + min: 0 + max: 1 + group: sampling + - path: top_p + type: number + label: Top P + description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call. + default: 1 + range: + max: 1 + group: sampling + - path: max_tokens + type: integer + label: Max tokens + description: Maximum number of tokens to generate. Generation stops when this limit is reached. + default: 1024 + range: + min: 1 + max: 2048 + group: generation_length + - path: expert_type + type: enum + label: Expert type + description: The type of expert to use. 'knowledge' answers with USD knowledge, 'code' responds with vanilla OpenUSD code, 'helperfunction' uses high-level helper functions, and 'auto' lets the LLM determine which expert to use. + default: auto + values: + - auto + - code + - knowledge + - helperfunction + group: provider_metadata From 3c8d451b268f8f9a798195e32824b70ea7f87afd Mon Sep 17 00:00:00 2001 From: Guillaume Gay Date: Mon, 15 Jun 2026 13:37:02 +0200 Subject: [PATCH 2/3] fix: remove unsupported nvidia params --- models/nvidia/gliner-pii.yaml | 5 ----- models/nvidia/nemoguard-jailbreak-detect.yaml | 7 +------ models/nvidia/nemotron-mini-4b-instruct.yaml | 5 ----- 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/models/nvidia/gliner-pii.yaml b/models/nvidia/gliner-pii.yaml index 9bbb542..f0bc975 100644 --- a/models/nvidia/gliner-pii.yaml +++ b/models/nvidia/gliner-pii.yaml @@ -3,11 +3,6 @@ provider: nvidia authType: api_key model: gliner-pii params: - - path: labels - type: string - label: Labels - description: Entity types to detect. If not specified, uses the default set of 55 PII categories including email, phone_number, ssn, first_name, last_name, and address. - group: provider_metadata - path: threshold type: number label: Threshold diff --git a/models/nvidia/nemoguard-jailbreak-detect.yaml b/models/nvidia/nemoguard-jailbreak-detect.yaml index 1800f49..4a6231d 100644 --- a/models/nvidia/nemoguard-jailbreak-detect.yaml +++ b/models/nvidia/nemoguard-jailbreak-detect.yaml @@ -2,9 +2,4 @@ provider: nvidia authType: api_key model: nemoguard-jailbreak-detect -params: - - path: input - type: string - label: Input - description: The text to classify for jailbreak attempts. Accepts a string or an array of strings. - group: provider_metadata +params: [] diff --git a/models/nvidia/nemotron-mini-4b-instruct.yaml b/models/nvidia/nemotron-mini-4b-instruct.yaml index 070d6bf..0283f9a 100644 --- a/models/nvidia/nemotron-mini-4b-instruct.yaml +++ b/models/nvidia/nemotron-mini-4b-instruct.yaml @@ -52,8 +52,3 @@ params: label: Stop description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence. group: generation_length - - path: tools - type: string - label: Tools - description: A list of tools the model may call. If no tools are provided, the model will not call any tools. - group: tooling From b8a740bab09ebd8355225cf094e8e512d2e414b6 Mon Sep 17 00:00:00 2001 From: Guillaume Gay Date: Mon, 15 Jun 2026 14:13:22 +0200 Subject: [PATCH 3/3] chore: regenerate modelparams package data --- packages/modelparams/scripts/codegen.ts | 4 + packages/modelparams/src/generated/data.ts | 976 ++++++++++++++++++ .../modelparams/src/generated/defaults.ts | 93 ++ .../modelparams/src/generated/model-ids.ts | 16 + .../modelparams/src/generated/params-by-id.ts | 108 ++ 5 files changed, 1197 insertions(+) diff --git a/packages/modelparams/scripts/codegen.ts b/packages/modelparams/scripts/codegen.ts index 3ae2ca9..b532a5e 100644 --- a/packages/modelparams/scripts/codegen.ts +++ b/packages/modelparams/scripts/codegen.ts @@ -32,6 +32,10 @@ function quoteKey(k: string): string { function emitParamsByIdEntry(m: Model): string { const id = modelId(m); const fields = m.params.map((p) => ` ${quoteKey(p.path)}: ${tsType(p)};`).join("\n"); + if (fields.length === 0) { + return ` ${JSON.stringify(id)}: Record;`; + } + return ` ${JSON.stringify(id)}: {\n${fields}\n };`; } diff --git a/packages/modelparams/src/generated/data.ts b/packages/modelparams/src/generated/data.ts index 0f8611e..49d13ec 100644 --- a/packages/modelparams/src/generated/data.ts +++ b/packages/modelparams/src/generated/data.ts @@ -9917,6 +9917,982 @@ export const CATALOG = [ } ] }, + { + "provider": "nvidia", + "authType": "api_key", + "model": "gliner-pii", + "params": [ + { + "path": "threshold", + "label": "Threshold", + "description": "Confidence threshold for entity detection. Lower values detect more entities but may include false positives.", + "group": "sampling", + "type": "number", + "default": 0.5, + "range": { + "min": 0, + "max": 1 + } + }, + { + "path": "chunk_length", + "label": "Chunk length", + "description": "Context window size for processing. Longer texts are automatically split into chunks with overlap for complete coverage. Must be greater than overlap.", + "group": "provider_metadata", + "type": "integer", + "default": 384, + "range": { + "min": 1, + "max": 2048 + } + }, + { + "path": "overlap", + "label": "Overlap", + "description": "Token overlap between chunks to prevent entity clipping. Must be less than chunk_length.", + "group": "provider_metadata", + "type": "integer", + "default": 128, + "range": { + "min": 0, + "max": 512 + } + }, + { + "path": "flat_ner", + "label": "Flat NER", + "description": "When true, prevents overlapping entity spans. When false, may return nested entities such as both a full name and its constituent first name.", + "group": "provider_metadata", + "type": "boolean", + "default": false + } + ] + }, + { + "provider": "nvidia", + "authType": "api_key", + "model": "llama-3.1-nemoguard-8b-topic-control", + "params": [ + { + "path": "temperature", + "label": "Temperature", + "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 0.5, + "range": { + "min": 0, + "max": 2 + } + }, + { + "path": "top_p", + "label": "Top P", + "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 1, + "range": { + "max": 1 + } + }, + { + "path": "max_tokens", + "label": "Max tokens", + "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.", + "group": "generation_length", + "type": "integer", + "default": 1024, + "range": { + "min": 1 + } + }, + { + "path": "frequency_penalty", + "label": "Frequency penalty", + "description": "Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", + "group": "sampling", + "type": "number", + "default": 0, + "range": { + "min": -2, + "max": 2 + } + }, + { + "path": "presence_penalty", + "label": "Presence penalty", + "description": "Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", + "group": "sampling", + "type": "number", + "default": 0, + "range": { + "min": -2, + "max": 2 + } + }, + { + "path": "stop", + "label": "Stop", + "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.", + "group": "generation_length", + "type": "string" + } + ] + }, + { + "provider": "nvidia", + "authType": "api_key", + "model": "llama-3.1-nemotron-nano-8b-v1", + "params": [ + { + "path": "temperature", + "label": "Temperature", + "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 0.6, + "range": { + "min": 0, + "max": 1 + } + }, + { + "path": "top_p", + "label": "Top P", + "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 0.95, + "range": { + "max": 1 + } + }, + { + "path": "max_tokens", + "label": "Max tokens", + "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.", + "group": "generation_length", + "type": "integer", + "default": 4096, + "range": { + "min": 1, + "max": 16384 + } + }, + { + "path": "frequency_penalty", + "label": "Frequency penalty", + "description": "Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", + "group": "sampling", + "type": "number", + "default": 0, + "range": { + "min": -2, + "max": 2 + } + }, + { + "path": "presence_penalty", + "label": "Presence penalty", + "description": "Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", + "group": "sampling", + "type": "number", + "default": 0, + "range": { + "min": -2, + "max": 2 + } + }, + { + "path": "seed", + "label": "Seed", + "description": "Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.", + "group": "sampling", + "type": "integer", + "default": 0, + "range": { + "min": 0, + "max": 18446744073709552000 + } + }, + { + "path": "stop", + "label": "Stop", + "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.", + "group": "generation_length", + "type": "string" + } + ] + }, + { + "provider": "nvidia", + "authType": "api_key", + "model": "llama-3.1-nemotron-safety-guard-8b-v3", + "params": [ + { + "path": "temperature", + "label": "Temperature", + "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied.", + "group": "sampling", + "type": "number", + "default": 0, + "range": { + "min": 0, + "max": 1 + } + } + ] + }, + { + "provider": "nvidia", + "authType": "api_key", + "model": "llama-3.1-nemotron-ultra-253b-v1", + "params": [ + { + "path": "temperature", + "label": "Temperature", + "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 0.6, + "range": { + "min": 0, + "max": 1 + } + }, + { + "path": "top_p", + "label": "Top P", + "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 0.95, + "range": { + "max": 1 + } + }, + { + "path": "max_tokens", + "label": "Max tokens", + "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.", + "group": "generation_length", + "type": "integer", + "default": 4096, + "range": { + "min": 1, + "max": 16384 + } + }, + { + "path": "frequency_penalty", + "label": "Frequency penalty", + "description": "Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", + "group": "sampling", + "type": "number", + "default": 0, + "range": { + "min": -2, + "max": 2 + } + }, + { + "path": "presence_penalty", + "label": "Presence penalty", + "description": "Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", + "group": "sampling", + "type": "number", + "default": 0, + "range": { + "min": -2, + "max": 2 + } + }, + { + "path": "seed", + "label": "Seed", + "description": "Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.", + "group": "sampling", + "type": "integer", + "default": 0, + "range": { + "min": 0, + "max": 18446744073709552000 + } + }, + { + "path": "stop", + "label": "Stop", + "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.", + "group": "generation_length", + "type": "string" + } + ] + }, + { + "provider": "nvidia", + "authType": "api_key", + "model": "llama-3.3-nemotron-super-49b-v1", + "params": [ + { + "path": "temperature", + "label": "Temperature", + "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 0.6, + "range": { + "min": 0, + "max": 1 + } + }, + { + "path": "top_p", + "label": "Top P", + "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 0.95, + "range": { + "max": 1 + } + }, + { + "path": "max_tokens", + "label": "Max tokens", + "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.", + "group": "generation_length", + "type": "integer", + "default": 4096, + "range": { + "min": 1, + "max": 16384 + } + }, + { + "path": "frequency_penalty", + "label": "Frequency penalty", + "description": "Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", + "group": "sampling", + "type": "number", + "default": 0, + "range": { + "min": -2, + "max": 2 + } + }, + { + "path": "presence_penalty", + "label": "Presence penalty", + "description": "Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", + "group": "sampling", + "type": "number", + "default": 0, + "range": { + "min": -2, + "max": 2 + } + }, + { + "path": "seed", + "label": "Seed", + "description": "Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.", + "group": "sampling", + "type": "integer", + "default": 0, + "range": { + "min": 0, + "max": 18446744073709552000 + } + }, + { + "path": "stop", + "label": "Stop", + "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.", + "group": "generation_length", + "type": "string" + } + ] + }, + { + "provider": "nvidia", + "authType": "api_key", + "model": "llama-3.3-nemotron-super-49b-v1.5", + "params": [ + { + "path": "temperature", + "label": "Temperature", + "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 0.6, + "range": { + "min": 0, + "max": 1 + } + }, + { + "path": "top_p", + "label": "Top P", + "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 0.95, + "range": { + "max": 1 + } + }, + { + "path": "max_tokens", + "label": "Max tokens", + "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.", + "group": "generation_length", + "type": "integer", + "default": 65536, + "range": { + "min": 1, + "max": 65536 + } + }, + { + "path": "frequency_penalty", + "label": "Frequency penalty", + "description": "Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", + "group": "sampling", + "type": "number", + "default": 0, + "range": { + "min": -2, + "max": 2 + } + }, + { + "path": "presence_penalty", + "label": "Presence penalty", + "description": "Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", + "group": "sampling", + "type": "number", + "default": 0, + "range": { + "min": -2, + "max": 2 + } + }, + { + "path": "seed", + "label": "Seed", + "description": "Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.", + "group": "sampling", + "type": "integer", + "default": 0, + "range": { + "min": 0, + "max": 18446744073709552000 + } + }, + { + "path": "stop", + "label": "Stop", + "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.", + "group": "generation_length", + "type": "string" + } + ] + }, + { + "provider": "nvidia", + "authType": "api_key", + "model": "nemoguard-jailbreak-detect", + "params": [] + }, + { + "provider": "nvidia", + "authType": "api_key", + "model": "nemotron-3-nano-30b-a3b", + "params": [ + { + "path": "temperature", + "label": "Temperature", + "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 1, + "range": { + "max": 1 + } + }, + { + "path": "top_p", + "label": "Top P", + "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 1, + "range": { + "max": 1 + } + }, + { + "path": "max_tokens", + "label": "Max tokens", + "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.", + "group": "generation_length", + "type": "integer", + "default": 16384, + "range": { + "min": 1, + "max": 32768 + } + }, + { + "path": "seed", + "label": "Seed", + "description": "Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result.", + "group": "sampling", + "type": "integer", + "range": { + "min": 0, + "max": 18446744073709552000 + } + }, + { + "path": "stop", + "label": "Stop", + "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.", + "group": "generation_length", + "type": "string" + } + ] + }, + { + "provider": "nvidia", + "authType": "api_key", + "model": "nemotron-3-super-120b-a12b", + "params": [ + { + "path": "temperature", + "label": "Temperature", + "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 1, + "range": { + "max": 1 + } + }, + { + "path": "top_p", + "label": "Top P", + "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 0.95, + "range": { + "max": 1 + } + }, + { + "path": "max_tokens", + "label": "Max tokens", + "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.", + "group": "generation_length", + "type": "integer", + "default": 16384, + "range": { + "min": 1, + "max": 32768 + } + }, + { + "path": "reasoning_effort", + "label": "Reasoning effort", + "description": "Controls the reasoning mode. 'none' disables reasoning tokens, 'low' enables low-effort reasoning, and 'high' enables full reasoning.", + "group": "reasoning", + "type": "enum", + "default": "high", + "values": [ + "none", + "low", + "high" + ] + }, + { + "path": "reasoning_budget", + "label": "Reasoning budget", + "description": "Maximum number of tokens the model may use for internal reasoning before being forced to end the reasoning trace. Use -1 to disable budget enforcement.", + "group": "reasoning", + "type": "integer", + "default": 16384, + "range": { + "min": -1, + "max": 32768 + } + }, + { + "path": "seed", + "label": "Seed", + "description": "Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result.", + "group": "sampling", + "type": "integer", + "range": { + "min": 0, + "max": 18446744073709552000 + } + }, + { + "path": "stop", + "label": "Stop", + "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.", + "group": "generation_length", + "type": "string" + } + ] + }, + { + "provider": "nvidia", + "authType": "api_key", + "model": "nemotron-3-ultra-550b-a55b", + "params": [ + { + "path": "temperature", + "label": "Temperature", + "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 1, + "range": { + "max": 1 + } + }, + { + "path": "top_p", + "label": "Top P", + "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 0.95, + "range": { + "max": 1 + } + }, + { + "path": "max_tokens", + "label": "Max tokens", + "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.", + "group": "generation_length", + "type": "integer", + "default": 16384, + "range": { + "min": 1, + "max": 32768 + } + }, + { + "path": "reasoning_effort", + "label": "Reasoning effort", + "description": "Controls the reasoning mode. 'none' disables reasoning tokens, 'medium' enables efficient reasoning, and 'high' enables full reasoning.", + "group": "reasoning", + "type": "enum", + "default": "high", + "values": [ + "none", + "medium", + "high" + ] + }, + { + "path": "reasoning_budget", + "label": "Reasoning budget", + "description": "Maximum number of tokens the model may use for internal reasoning before being forced to end the reasoning trace. Use -1 to disable budget enforcement.", + "group": "reasoning", + "type": "integer", + "default": 16384, + "range": { + "min": -1, + "max": 32768 + } + }, + { + "path": "seed", + "label": "Seed", + "description": "Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result.", + "group": "sampling", + "type": "integer", + "range": { + "min": 0, + "max": 18446744073709552000 + } + }, + { + "path": "stop", + "label": "Stop", + "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.", + "group": "generation_length", + "type": "string" + } + ] + }, + { + "provider": "nvidia", + "authType": "api_key", + "model": "nemotron-content-safety-reasoning-4b", + "params": [ + { + "path": "temperature", + "label": "Temperature", + "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 1, + "range": { + "max": 1 + } + }, + { + "path": "top_p", + "label": "Top P", + "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 1, + "range": { + "max": 1 + } + }, + { + "path": "max_tokens", + "label": "Max tokens", + "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.", + "group": "generation_length", + "type": "integer", + "default": 16384, + "range": { + "min": 1, + "max": 32768 + } + }, + { + "path": "seed", + "label": "Seed", + "description": "Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result.", + "group": "sampling", + "type": "integer", + "range": { + "min": 0, + "max": 18446744073709552000 + } + }, + { + "path": "stop", + "label": "Stop", + "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.", + "group": "generation_length", + "type": "string" + } + ] + }, + { + "provider": "nvidia", + "authType": "api_key", + "model": "nemotron-mini-4b-instruct", + "params": [ + { + "path": "temperature", + "label": "Temperature", + "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 0.2, + "range": { + "min": 0, + "max": 1 + } + }, + { + "path": "top_p", + "label": "Top P", + "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 0.7, + "range": { + "max": 1 + } + }, + { + "path": "max_tokens", + "label": "Max tokens", + "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.", + "group": "generation_length", + "type": "integer", + "default": 1024, + "range": { + "min": 1, + "max": 4096 + } + }, + { + "path": "frequency_penalty", + "label": "Frequency penalty", + "description": "Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", + "group": "sampling", + "type": "number", + "default": 0, + "range": { + "min": -2, + "max": 2 + } + }, + { + "path": "presence_penalty", + "label": "Presence penalty", + "description": "Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", + "group": "sampling", + "type": "number", + "default": 0, + "range": { + "min": -2, + "max": 2 + } + }, + { + "path": "stop", + "label": "Stop", + "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.", + "group": "generation_length", + "type": "string" + } + ] + }, + { + "provider": "nvidia", + "authType": "api_key", + "model": "riva-translate-4b-instruct-v1.1", + "params": [ + { + "path": "temperature", + "label": "Temperature", + "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 0, + "range": { + "min": 0, + "max": 1 + } + }, + { + "path": "top_p", + "label": "Top P", + "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 0.9, + "range": { + "max": 1 + } + }, + { + "path": "max_tokens", + "label": "Max tokens", + "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.", + "group": "generation_length", + "type": "integer", + "default": 512, + "range": { + "min": 1, + "max": 4096 + } + }, + { + "path": "frequency_penalty", + "label": "Frequency penalty", + "description": "Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", + "group": "sampling", + "type": "number", + "default": 0, + "range": { + "min": -2, + "max": 2 + } + }, + { + "path": "presence_penalty", + "label": "Presence penalty", + "description": "Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", + "group": "sampling", + "type": "number", + "default": 0, + "range": { + "min": -2, + "max": 2 + } + }, + { + "path": "stop", + "label": "Stop", + "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.", + "group": "generation_length", + "type": "string" + } + ] + }, + { + "provider": "nvidia", + "authType": "api_key", + "model": "usdcode-llama-3.1-70b-instruct", + "params": [ + { + "path": "temperature", + "label": "Temperature", + "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 0.1, + "range": { + "min": 0, + "max": 1 + } + }, + { + "path": "top_p", + "label": "Top P", + "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.", + "group": "sampling", + "type": "number", + "default": 1, + "range": { + "max": 1 + } + }, + { + "path": "max_tokens", + "label": "Max tokens", + "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.", + "group": "generation_length", + "type": "integer", + "default": 1024, + "range": { + "min": 1, + "max": 2048 + } + }, + { + "path": "expert_type", + "label": "Expert type", + "description": "The type of expert to use. 'knowledge' answers with USD knowledge, 'code' responds with vanilla OpenUSD code, 'helperfunction' uses high-level helper functions, and 'auto' lets the LLM determine which expert to use.", + "group": "provider_metadata", + "type": "enum", + "default": "auto", + "values": [ + "auto", + "code", + "knowledge", + "helperfunction" + ] + } + ] + }, { "provider": "openai", "authType": "api_key", diff --git a/packages/modelparams/src/generated/defaults.ts b/packages/modelparams/src/generated/defaults.ts index e64886f..fdb86a2 100644 --- a/packages/modelparams/src/generated/defaults.ts +++ b/packages/modelparams/src/generated/defaults.ts @@ -730,6 +730,99 @@ export const DEFAULTS = { frequency_penalty: 0, "response_format.type": "text", }, + "nvidia/gliner-pii": { + threshold: 0.5, + chunk_length: 384, + overlap: 128, + flat_ner: false, + }, + "nvidia/llama-3.1-nemoguard-8b-topic-control": { + temperature: 0.5, + top_p: 1, + max_tokens: 1024, + frequency_penalty: 0, + presence_penalty: 0, + }, + "nvidia/llama-3.1-nemotron-nano-8b-v1": { + temperature: 0.6, + top_p: 0.95, + max_tokens: 4096, + frequency_penalty: 0, + presence_penalty: 0, + seed: 0, + }, + "nvidia/llama-3.1-nemotron-safety-guard-8b-v3": { + temperature: 0, + }, + "nvidia/llama-3.1-nemotron-ultra-253b-v1": { + temperature: 0.6, + top_p: 0.95, + max_tokens: 4096, + frequency_penalty: 0, + presence_penalty: 0, + seed: 0, + }, + "nvidia/llama-3.3-nemotron-super-49b-v1": { + temperature: 0.6, + top_p: 0.95, + max_tokens: 4096, + frequency_penalty: 0, + presence_penalty: 0, + seed: 0, + }, + "nvidia/llama-3.3-nemotron-super-49b-v1.5": { + temperature: 0.6, + top_p: 0.95, + max_tokens: 65536, + frequency_penalty: 0, + presence_penalty: 0, + seed: 0, + }, + "nvidia/nemoguard-jailbreak-detect": {}, + "nvidia/nemotron-3-nano-30b-a3b": { + temperature: 1, + top_p: 1, + max_tokens: 16384, + }, + "nvidia/nemotron-3-super-120b-a12b": { + temperature: 1, + top_p: 0.95, + max_tokens: 16384, + reasoning_effort: "high", + reasoning_budget: 16384, + }, + "nvidia/nemotron-3-ultra-550b-a55b": { + temperature: 1, + top_p: 0.95, + max_tokens: 16384, + reasoning_effort: "high", + reasoning_budget: 16384, + }, + "nvidia/nemotron-content-safety-reasoning-4b": { + temperature: 1, + top_p: 1, + max_tokens: 16384, + }, + "nvidia/nemotron-mini-4b-instruct": { + temperature: 0.2, + top_p: 0.7, + max_tokens: 1024, + frequency_penalty: 0, + presence_penalty: 0, + }, + "nvidia/riva-translate-4b-instruct-v1.1": { + temperature: 0, + top_p: 0.9, + max_tokens: 512, + frequency_penalty: 0, + presence_penalty: 0, + }, + "nvidia/usdcode-llama-3.1-70b-instruct": { + temperature: 0.1, + top_p: 1, + max_tokens: 1024, + expert_type: "auto", + }, "openai/chatgpt-4o-latest": { max_tokens: 4096, temperature: 1, diff --git a/packages/modelparams/src/generated/model-ids.ts b/packages/modelparams/src/generated/model-ids.ts index 33a1830..3b7eae0 100644 --- a/packages/modelparams/src/generated/model-ids.ts +++ b/packages/modelparams/src/generated/model-ids.ts @@ -109,6 +109,21 @@ export const MODEL_IDS = [ "moonshot/moonshot-v1-128k", "moonshot/moonshot-v1-32k", "moonshot/moonshot-v1-8k", + "nvidia/gliner-pii", + "nvidia/llama-3.1-nemoguard-8b-topic-control", + "nvidia/llama-3.1-nemotron-nano-8b-v1", + "nvidia/llama-3.1-nemotron-safety-guard-8b-v3", + "nvidia/llama-3.1-nemotron-ultra-253b-v1", + "nvidia/llama-3.3-nemotron-super-49b-v1", + "nvidia/llama-3.3-nemotron-super-49b-v1.5", + "nvidia/nemoguard-jailbreak-detect", + "nvidia/nemotron-3-nano-30b-a3b", + "nvidia/nemotron-3-super-120b-a12b", + "nvidia/nemotron-3-ultra-550b-a55b", + "nvidia/nemotron-content-safety-reasoning-4b", + "nvidia/nemotron-mini-4b-instruct", + "nvidia/riva-translate-4b-instruct-v1.1", + "nvidia/usdcode-llama-3.1-70b-instruct", "openai/chatgpt-4o-latest", "openai/gpt-3.5-turbo", "openai/gpt-4-turbo", @@ -192,6 +207,7 @@ export const PROVIDERS = [ "minimax", "mistral", "moonshot", + "nvidia", "openai", "perplexity", "xai", diff --git a/packages/modelparams/src/generated/params-by-id.ts b/packages/modelparams/src/generated/params-by-id.ts index 67765f8..17357d0 100644 --- a/packages/modelparams/src/generated/params-by-id.ts +++ b/packages/modelparams/src/generated/params-by-id.ts @@ -913,6 +913,114 @@ export type ParamsById = { frequency_penalty: number; "response_format.type": "text" | "json_object"; }; + "nvidia/gliner-pii": { + threshold: number; + chunk_length: number; + overlap: number; + flat_ner: boolean; + }; + "nvidia/llama-3.1-nemoguard-8b-topic-control": { + temperature: number; + top_p: number; + max_tokens: number; + frequency_penalty: number; + presence_penalty: number; + stop: string; + }; + "nvidia/llama-3.1-nemotron-nano-8b-v1": { + temperature: number; + top_p: number; + max_tokens: number; + frequency_penalty: number; + presence_penalty: number; + seed: number; + stop: string; + }; + "nvidia/llama-3.1-nemotron-safety-guard-8b-v3": { + temperature: number; + }; + "nvidia/llama-3.1-nemotron-ultra-253b-v1": { + temperature: number; + top_p: number; + max_tokens: number; + frequency_penalty: number; + presence_penalty: number; + seed: number; + stop: string; + }; + "nvidia/llama-3.3-nemotron-super-49b-v1": { + temperature: number; + top_p: number; + max_tokens: number; + frequency_penalty: number; + presence_penalty: number; + seed: number; + stop: string; + }; + "nvidia/llama-3.3-nemotron-super-49b-v1.5": { + temperature: number; + top_p: number; + max_tokens: number; + frequency_penalty: number; + presence_penalty: number; + seed: number; + stop: string; + }; + "nvidia/nemoguard-jailbreak-detect": Record; + "nvidia/nemotron-3-nano-30b-a3b": { + temperature: number; + top_p: number; + max_tokens: number; + seed: number; + stop: string; + }; + "nvidia/nemotron-3-super-120b-a12b": { + temperature: number; + top_p: number; + max_tokens: number; + reasoning_effort: "none" | "low" | "high"; + reasoning_budget: number; + seed: number; + stop: string; + }; + "nvidia/nemotron-3-ultra-550b-a55b": { + temperature: number; + top_p: number; + max_tokens: number; + reasoning_effort: "none" | "medium" | "high"; + reasoning_budget: number; + seed: number; + stop: string; + }; + "nvidia/nemotron-content-safety-reasoning-4b": { + temperature: number; + top_p: number; + max_tokens: number; + seed: number; + stop: string; + }; + "nvidia/nemotron-mini-4b-instruct": { + temperature: number; + top_p: number; + max_tokens: number; + frequency_penalty: number; + presence_penalty: number; + stop: string; + }; + "nvidia/riva-translate-4b-instruct-v1.1": { + temperature: number; + top_p: number; + max_tokens: number; + frequency_penalty: number; + presence_penalty: number; + stop: string; + }; + "nvidia/usdcode-llama-3.1-70b-instruct": { + temperature: number; + top_p: number; + max_tokens: number; + expert_type: "auto" | "code" | "knowledge" | "helperfunction"; + }; "openai/chatgpt-4o-latest": { max_tokens: number; temperature: number;