From 262656b824549272f42b90ceef5e59fa356ce968 Mon Sep 17 00:00:00 2001
From: Tyler Ford <25458477+TF0rd@users.noreply.github.com>
Date: Sat, 6 Jun 2026 19:38:22 -0400
Subject: [PATCH 1/3] add nvidia provider with 15 models
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add model parameter catalog entries for 15 NVIDIA NIM API models
(provider: nvidia, authType: api_key), with parameters sourced from
the official NVIDIA API reference at docs.api.nvidia.com.

Models added:
- nemotron-3-ultra-550b-a55b (reasoning_effort, reasoning_budget)
- nemotron-3-super-120b-a12b (reasoning_effort, reasoning_budget)
- nemotron-3-nano-30b-a3b
- nemotron-mini-4b-instruct (tools support)
- nemotron-content-safety-reasoning-4b
- llama-3.1-nemotron-nano-8b-v1
- llama-3.1-nemotron-ultra-253b-v1
- llama-3.3-nemotron-super-49b-v1
- llama-3.3-nemotron-super-49b-v1.5 (65536 max_tokens)
- llama-3.1-nemotron-safety-guard-8b-v3
- llama-3.1-nemoguard-8b-topic-control
- riva-translate-4b-instruct-v1.1
- usdcode-llama-3.1-70b-instruct (expert_type enum)
- gliner-pii (entity extraction: labels, threshold, chunk_length,
  overlap, flat_ner)
- nemoguard-jailbreak-detect

Two models are omitted because their infer pages document only the
`stream` field (a reserved MPS path, not eligible as a parameter):
- llama-3.1-nemoguard-8b-content-safety
- nvidia-nemotron-nano-9b-v2 (infer page appears buggy; serves
  content-safety model data)
---
 models/nvidia/gliner-pii.yaml                 | 43 +++++++++++++
 .../llama-3.1-nemoguard-8b-topic-control.yaml | 53 ++++++++++++++++
 .../nvidia/llama-3.1-nemotron-nano-8b-v1.yaml | 63 +++++++++++++++++++
 ...llama-3.1-nemotron-safety-guard-8b-v3.yaml | 14 +++++
 .../llama-3.1-nemotron-ultra-253b-v1.yaml     | 63 +++++++++++++++++++
 .../llama-3.3-nemotron-super-49b-v1.5.yaml    | 63 +++++++++++++++++++
 .../llama-3.3-nemotron-super-49b-v1.yaml      | 63 +++++++++++++++++++
 models/nvidia/nemoguard-jailbreak-detect.yaml | 10 +++
 models/nvidia/nemotron-3-nano-30b-a3b.yaml    | 43 +++++++++++++
 models/nvidia/nemotron-3-super-120b-a12b.yaml | 62 ++++++++++++++++++
 models/nvidia/nemotron-3-ultra-550b-a55b.yaml | 62 ++++++++++++++++++
 .../nemotron-content-safety-reasoning-4b.yaml | 43 +++++++++++++
 models/nvidia/nemotron-mini-4b-instruct.yaml  | 59 +++++++++++++++++
 .../riva-translate-4b-instruct-v1.1.yaml      | 54 ++++++++++++++++
 .../usdcode-llama-3.1-70b-instruct.yaml       | 42 +++++++++++++
 15 files changed, 737 insertions(+)
 create mode 100644 models/nvidia/gliner-pii.yaml
 create mode 100644 models/nvidia/llama-3.1-nemoguard-8b-topic-control.yaml
 create mode 100644 models/nvidia/llama-3.1-nemotron-nano-8b-v1.yaml
 create mode 100644 models/nvidia/llama-3.1-nemotron-safety-guard-8b-v3.yaml
 create mode 100644 models/nvidia/llama-3.1-nemotron-ultra-253b-v1.yaml
 create mode 100644 models/nvidia/llama-3.3-nemotron-super-49b-v1.5.yaml
 create mode 100644 models/nvidia/llama-3.3-nemotron-super-49b-v1.yaml
 create mode 100644 models/nvidia/nemoguard-jailbreak-detect.yaml
 create mode 100644 models/nvidia/nemotron-3-nano-30b-a3b.yaml
 create mode 100644 models/nvidia/nemotron-3-super-120b-a12b.yaml
 create mode 100644 models/nvidia/nemotron-3-ultra-550b-a55b.yaml
 create mode 100644 models/nvidia/nemotron-content-safety-reasoning-4b.yaml
 create mode 100644 models/nvidia/nemotron-mini-4b-instruct.yaml
 create mode 100644 models/nvidia/riva-translate-4b-instruct-v1.1.yaml
 create mode 100644 models/nvidia/usdcode-llama-3.1-70b-instruct.yaml

diff --git a/models/nvidia/gliner-pii.yaml b/models/nvidia/gliner-pii.yaml
new file mode 100644
index 0000000..9bbb542
--- /dev/null
+++ b/models/nvidia/gliner-pii.yaml
@@ -0,0 +1,43 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: gliner-pii
+params:
+  - path: labels
+    type: string
+    label: Labels
+    description: Entity types to detect. If not specified, uses the default set of 55 PII categories including email, phone_number, ssn, first_name, last_name, and address.
+    group: provider_metadata
+  - path: threshold
+    type: number
+    label: Threshold
+    description: Confidence threshold for entity detection. Lower values detect more entities but may include false positives.
+    default: 0.5
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: chunk_length
+    type: integer
+    label: Chunk length
+    description: Context window size for processing. Longer texts are automatically split into chunks with overlap for complete coverage. Must be greater than overlap.
+    default: 384
+    range:
+      min: 1
+      max: 2048
+    group: provider_metadata
+  - path: overlap
+    type: integer
+    label: Overlap
+    description: Token overlap between chunks to prevent entity clipping. Must be less than chunk_length.
+    default: 128
+    range:
+      min: 0
+      max: 512
+    group: provider_metadata
+  - path: flat_ner
+    type: boolean
+    label: Flat NER
+    description: When true, prevents overlapping entity spans. When false, may return nested entities such as both a full name and its constituent first name.
+    default: false
+    group: provider_metadata
diff --git a/models/nvidia/llama-3.1-nemoguard-8b-topic-control.yaml b/models/nvidia/llama-3.1-nemoguard-8b-topic-control.yaml
new file mode 100644
index 0000000..bfc0d61
--- /dev/null
+++ b/models/nvidia/llama-3.1-nemoguard-8b-topic-control.yaml
@@ -0,0 +1,53 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.1-nemoguard-8b-topic-control
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.5
+    range:
+      min: 0
+      max: 2
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 1
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 1024
+    range:
+      min: 1
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/llama-3.1-nemotron-nano-8b-v1.yaml b/models/nvidia/llama-3.1-nemotron-nano-8b-v1.yaml
new file mode 100644
index 0000000..fc46239
--- /dev/null
+++ b/models/nvidia/llama-3.1-nemotron-nano-8b-v1.yaml
@@ -0,0 +1,63 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.1-nemotron-nano-8b-v1
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.6
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.95
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 4096
+    range:
+      min: 1
+      max: 16384
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.
+    default: 0
+    range:
+      min: 0
+      max: 18446744073709552000
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/llama-3.1-nemotron-safety-guard-8b-v3.yaml b/models/nvidia/llama-3.1-nemotron-safety-guard-8b-v3.yaml
new file mode 100644
index 0000000..4256093
--- /dev/null
+++ b/models/nvidia/llama-3.1-nemotron-safety-guard-8b-v3.yaml
@@ -0,0 +1,14 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.1-nemotron-safety-guard-8b-v3
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied.
+    default: 0
+    range:
+      min: 0
+      max: 1
+    group: sampling
diff --git a/models/nvidia/llama-3.1-nemotron-ultra-253b-v1.yaml b/models/nvidia/llama-3.1-nemotron-ultra-253b-v1.yaml
new file mode 100644
index 0000000..367b0ef
--- /dev/null
+++ b/models/nvidia/llama-3.1-nemotron-ultra-253b-v1.yaml
@@ -0,0 +1,63 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.1-nemotron-ultra-253b-v1
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.6
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.95
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 4096
+    range:
+      min: 1
+      max: 16384
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.
+    default: 0
+    range:
+      min: 0
+      max: 18446744073709552000
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/llama-3.3-nemotron-super-49b-v1.5.yaml b/models/nvidia/llama-3.3-nemotron-super-49b-v1.5.yaml
new file mode 100644
index 0000000..b20d19b
--- /dev/null
+++ b/models/nvidia/llama-3.3-nemotron-super-49b-v1.5.yaml
@@ -0,0 +1,63 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.3-nemotron-super-49b-v1.5
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.6
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.95
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 65536
+    range:
+      min: 1
+      max: 65536
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.
+    default: 0
+    range:
+      min: 0
+      max: 18446744073709552000
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/llama-3.3-nemotron-super-49b-v1.yaml b/models/nvidia/llama-3.3-nemotron-super-49b-v1.yaml
new file mode 100644
index 0000000..313d97e
--- /dev/null
+++ b/models/nvidia/llama-3.3-nemotron-super-49b-v1.yaml
@@ -0,0 +1,63 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: llama-3.3-nemotron-super-49b-v1
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.6
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.95
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 4096
+    range:
+      min: 1
+      max: 16384
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.
+    default: 0
+    range:
+      min: 0
+      max: 18446744073709552000
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/nemoguard-jailbreak-detect.yaml b/models/nvidia/nemoguard-jailbreak-detect.yaml
new file mode 100644
index 0000000..1800f49
--- /dev/null
+++ b/models/nvidia/nemoguard-jailbreak-detect.yaml
@@ -0,0 +1,10 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: nemoguard-jailbreak-detect
+params:
+  - path: input
+    type: string
+    label: Input
+    description: The text to classify for jailbreak attempts. Accepts a string or an array of strings.
+    group: provider_metadata
diff --git a/models/nvidia/nemotron-3-nano-30b-a3b.yaml b/models/nvidia/nemotron-3-nano-30b-a3b.yaml
new file mode 100644
index 0000000..aaf9bcc
--- /dev/null
+++ b/models/nvidia/nemotron-3-nano-30b-a3b.yaml
@@ -0,0 +1,43 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: nemotron-3-nano-30b-a3b
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 1
+    range:
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 1
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 16384
+    range:
+      min: 1
+      max: 32768
+    group: generation_length
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result.
+    range:
+      min: 0
+      max: 18446744073709552000
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/nemotron-3-super-120b-a12b.yaml b/models/nvidia/nemotron-3-super-120b-a12b.yaml
new file mode 100644
index 0000000..1a81de2
--- /dev/null
+++ b/models/nvidia/nemotron-3-super-120b-a12b.yaml
@@ -0,0 +1,62 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: nemotron-3-super-120b-a12b
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 1
+    range:
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.95
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 16384
+    range:
+      min: 1
+      max: 32768
+    group: generation_length
+  - path: reasoning_effort
+    type: enum
+    label: Reasoning effort
+    description: Controls the reasoning mode. 'none' disables reasoning tokens, 'low' enables low-effort reasoning, and 'high' enables full reasoning.
+    default: high
+    values:
+      - none
+      - low
+      - high
+    group: reasoning
+  - path: reasoning_budget
+    type: integer
+    label: Reasoning budget
+    description: Maximum number of tokens the model may use for internal reasoning before being forced to end the reasoning trace. Use -1 to disable budget enforcement.
+    default: 16384
+    range:
+      min: -1
+      max: 32768
+    group: reasoning
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result.
+    range:
+      min: 0
+      max: 18446744073709552000
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/nemotron-3-ultra-550b-a55b.yaml b/models/nvidia/nemotron-3-ultra-550b-a55b.yaml
new file mode 100644
index 0000000..b7c2fef
--- /dev/null
+++ b/models/nvidia/nemotron-3-ultra-550b-a55b.yaml
@@ -0,0 +1,62 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: nemotron-3-ultra-550b-a55b
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 1
+    range:
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.95
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 16384
+    range:
+      min: 1
+      max: 32768
+    group: generation_length
+  - path: reasoning_effort
+    type: enum
+    label: Reasoning effort
+    description: Controls the reasoning mode. 'none' disables reasoning tokens, 'medium' enables efficient reasoning, and 'high' enables full reasoning.
+    default: high
+    values:
+      - none
+      - medium
+      - high
+    group: reasoning
+  - path: reasoning_budget
+    type: integer
+    label: Reasoning budget
+    description: Maximum number of tokens the model may use for internal reasoning before being forced to end the reasoning trace. Use -1 to disable budget enforcement.
+    default: 16384
+    range:
+      min: -1
+      max: 32768
+    group: reasoning
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result.
+    range:
+      min: 0
+      max: 18446744073709552000
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/nemotron-content-safety-reasoning-4b.yaml b/models/nvidia/nemotron-content-safety-reasoning-4b.yaml
new file mode 100644
index 0000000..7bfa8bb
--- /dev/null
+++ b/models/nvidia/nemotron-content-safety-reasoning-4b.yaml
@@ -0,0 +1,43 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: nemotron-content-safety-reasoning-4b
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 1
+    range:
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 1
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 16384
+    range:
+      min: 1
+      max: 32768
+    group: generation_length
+  - path: seed
+    type: integer
+    label: Seed
+    description: Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result.
+    range:
+      min: 0
+      max: 18446744073709552000
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/nemotron-mini-4b-instruct.yaml b/models/nvidia/nemotron-mini-4b-instruct.yaml
new file mode 100644
index 0000000..070d6bf
--- /dev/null
+++ b/models/nvidia/nemotron-mini-4b-instruct.yaml
@@ -0,0 +1,59 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: nemotron-mini-4b-instruct
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.2
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.7
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 1024
+    range:
+      min: 1
+      max: 4096
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
+  - path: tools
+    type: string
+    label: Tools
+    description: A list of tools the model may call. If no tools are provided, the model will not call any tools.
+    group: tooling
diff --git a/models/nvidia/riva-translate-4b-instruct-v1.1.yaml b/models/nvidia/riva-translate-4b-instruct-v1.1.yaml
new file mode 100644
index 0000000..420c6e0
--- /dev/null
+++ b/models/nvidia/riva-translate-4b-instruct-v1.1.yaml
@@ -0,0 +1,54 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: riva-translate-4b-instruct-v1.1
+params:
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.9
+    range:
+      max: 1
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 512
+    range:
+      min: 1
+      max: 4096
+    group: generation_length
+  - path: frequency_penalty
+    type: number
+    label: Frequency penalty
+    description: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: presence_penalty
+    type: number
+    label: Presence penalty
+    description: Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    default: 0
+    range:
+      min: -2
+      max: 2
+    group: sampling
+  - path: stop
+    type: string
+    label: Stop
+    description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+    group: generation_length
diff --git a/models/nvidia/usdcode-llama-3.1-70b-instruct.yaml b/models/nvidia/usdcode-llama-3.1-70b-instruct.yaml
new file mode 100644
index 0000000..6f4aa2b
--- /dev/null
+++ b/models/nvidia/usdcode-llama-3.1-70b-instruct.yaml
@@ -0,0 +1,42 @@
+# yaml-language-server: $schema=https://modelparams.dev/api/v1/schema.json
+provider: nvidia
+authType: api_key
+model: usdcode-llama-3.1-70b-instruct
+params:  # request parameters catalogued for this model; one list item per parameter path
+  - path: temperature
+    type: number
+    label: Temperature
+    description: Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.
+    default: 0.1
+    range:
+      min: 0
+      max: 1
+    group: sampling
+  - path: top_p
+    type: number
+    label: Top P
+    description: Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.
+    default: 1
+    range:
+      max: 1  # NOTE(review): only an upper bound is declared here; confirm that omitting min is intended
+    group: sampling
+  - path: max_tokens
+    type: integer
+    label: Max tokens
+    description: Maximum number of tokens to generate. Generation stops when this limit is reached.
+    default: 1024
+    range:
+      min: 1
+      max: 2048
+    group: generation_length
+  - path: expert_type
+    type: enum  # the allowed choices are enumerated under values below
+    label: Expert type
+    description: The type of expert to use. 'knowledge' answers with USD knowledge, 'code' responds with vanilla OpenUSD code, 'helperfunction' uses high-level helper functions, and 'auto' lets the LLM determine which expert to use.
+    default: auto  # one of the entries listed under values
+    values:
+      - auto
+      - code
+      - knowledge
+      - helperfunction
+    group: provider_metadata

From 3c8d451b268f8f9a798195e32824b70ea7f87afd Mon Sep 17 00:00:00 2001
From: Guillaume Gay <guillaume.gay@protonmail.com>
Date: Mon, 15 Jun 2026 13:37:02 +0200
Subject: [PATCH 2/3] fix: remove unsupported nvidia params

---
 models/nvidia/gliner-pii.yaml                 | 5 -----
 models/nvidia/nemoguard-jailbreak-detect.yaml | 7 +------
 models/nvidia/nemotron-mini-4b-instruct.yaml  | 5 -----
 3 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/models/nvidia/gliner-pii.yaml b/models/nvidia/gliner-pii.yaml
index 9bbb542..f0bc975 100644
--- a/models/nvidia/gliner-pii.yaml
+++ b/models/nvidia/gliner-pii.yaml
@@ -3,11 +3,6 @@ provider: nvidia
 authType: api_key
 model: gliner-pii
 params:
-  - path: labels
-    type: string
-    label: Labels
-    description: Entity types to detect. If not specified, uses the default set of 55 PII categories including email, phone_number, ssn, first_name, last_name, and address.
-    group: provider_metadata
   - path: threshold
     type: number
     label: Threshold
diff --git a/models/nvidia/nemoguard-jailbreak-detect.yaml b/models/nvidia/nemoguard-jailbreak-detect.yaml
index 1800f49..4a6231d 100644
--- a/models/nvidia/nemoguard-jailbreak-detect.yaml
+++ b/models/nvidia/nemoguard-jailbreak-detect.yaml
@@ -2,9 +2,4 @@
 provider: nvidia
 authType: api_key
 model: nemoguard-jailbreak-detect
-params:
-  - path: input
-    type: string
-    label: Input
-    description: The text to classify for jailbreak attempts. Accepts a string or an array of strings.
-    group: provider_metadata
+params: []
diff --git a/models/nvidia/nemotron-mini-4b-instruct.yaml b/models/nvidia/nemotron-mini-4b-instruct.yaml
index 070d6bf..0283f9a 100644
--- a/models/nvidia/nemotron-mini-4b-instruct.yaml
+++ b/models/nvidia/nemotron-mini-4b-instruct.yaml
@@ -52,8 +52,3 @@ params:
     label: Stop
     description: A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.
     group: generation_length
-  - path: tools
-    type: string
-    label: Tools
-    description: A list of tools the model may call. If no tools are provided, the model will not call any tools.
-    group: tooling

From b8a740bab09ebd8355225cf094e8e512d2e414b6 Mon Sep 17 00:00:00 2001
From: Guillaume Gay <guillaume.gay@protonmail.com>
Date: Mon, 15 Jun 2026 14:13:22 +0200
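Subject: [PATCH 3/3] chore: regenerate modelparams package data

Also update scripts/codegen.ts to emit `Record<string, never>` for models
whose params list is empty (e.g. nemoguard-jailbreak-detect).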
---
 packages/modelparams/scripts/codegen.ts       |   4 +
 packages/modelparams/src/generated/data.ts    | 976 ++++++++++++++++++
 .../modelparams/src/generated/defaults.ts     |  93 ++
 .../modelparams/src/generated/model-ids.ts    |  16 +
 .../modelparams/src/generated/params-by-id.ts | 108 ++
 5 files changed, 1197 insertions(+)

diff --git a/packages/modelparams/scripts/codegen.ts b/packages/modelparams/scripts/codegen.ts
index 3ae2ca9..b532a5e 100644
--- a/packages/modelparams/scripts/codegen.ts
+++ b/packages/modelparams/scripts/codegen.ts
@@ -32,6 +32,10 @@ function quoteKey(k: string): string {
 function emitParamsByIdEntry(m: Model): string {
   const id = modelId(m);
+  // A model with no params is typed as an empty record; the template below would emit `{\n\n  }`.
+  if (m.params.length === 0) {
+    return `  ${JSON.stringify(id)}: Record<string, never>;`;
+  }
   const fields = m.params.map((p) => `    ${quoteKey(p.path)}: ${tsType(p)};`).join("\n");
   return `  ${JSON.stringify(id)}: {\n${fields}\n  };`;
 }
 
diff --git a/packages/modelparams/src/generated/data.ts b/packages/modelparams/src/generated/data.ts
index 0f8611e..49d13ec 100644
--- a/packages/modelparams/src/generated/data.ts
+++ b/packages/modelparams/src/generated/data.ts
@@ -9917,6 +9917,982 @@ export const CATALOG = [
       }
     ]
   },
+  {
+    "provider": "nvidia",
+    "authType": "api_key",
+    "model": "gliner-pii",
+    "params": [
+      {
+        "path": "threshold",
+        "label": "Threshold",
+        "description": "Confidence threshold for entity detection. Lower values detect more entities but may include false positives.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0.5,
+        "range": {
+          "min": 0,
+          "max": 1
+        }
+      },
+      {
+        "path": "chunk_length",
+        "label": "Chunk length",
+        "description": "Context window size for processing. Longer texts are automatically split into chunks with overlap for complete coverage. Must be greater than overlap.",
+        "group": "provider_metadata",
+        "type": "integer",
+        "default": 384,
+        "range": {
+          "min": 1,
+          "max": 2048
+        }
+      },
+      {
+        "path": "overlap",
+        "label": "Overlap",
+        "description": "Token overlap between chunks to prevent entity clipping. Must be less than chunk_length.",
+        "group": "provider_metadata",
+        "type": "integer",
+        "default": 128,
+        "range": {
+          "min": 0,
+          "max": 512
+        }
+      },
+      {
+        "path": "flat_ner",
+        "label": "Flat NER",
+        "description": "When true, prevents overlapping entity spans. When false, may return nested entities such as both a full name and its constituent first name.",
+        "group": "provider_metadata",
+        "type": "boolean",
+        "default": false
+      }
+    ]
+  },
+  {
+    "provider": "nvidia",
+    "authType": "api_key",
+    "model": "llama-3.1-nemoguard-8b-topic-control",
+    "params": [
+      {
+        "path": "temperature",
+        "label": "Temperature",
+        "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0.5,
+        "range": {
+          "min": 0,
+          "max": 2
+        }
+      },
+      {
+        "path": "top_p",
+        "label": "Top P",
+        "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 1,
+        "range": {
+          "max": 1
+        }
+      },
+      {
+        "path": "max_tokens",
+        "label": "Max tokens",
+        "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.",
+        "group": "generation_length",
+        "type": "integer",
+        "default": 1024,
+        "range": {
+          "min": 1
+        }
+      },
+      {
+        "path": "frequency_penalty",
+        "label": "Frequency penalty",
+        "description": "Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0,
+        "range": {
+          "min": -2,
+          "max": 2
+        }
+      },
+      {
+        "path": "presence_penalty",
+        "label": "Presence penalty",
+        "description": "Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0,
+        "range": {
+          "min": -2,
+          "max": 2
+        }
+      },
+      {
+        "path": "stop",
+        "label": "Stop",
+        "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.",
+        "group": "generation_length",
+        "type": "string"
+      }
+    ]
+  },
+  {
+    "provider": "nvidia",
+    "authType": "api_key",
+    "model": "llama-3.1-nemotron-nano-8b-v1",
+    "params": [
+      {
+        "path": "temperature",
+        "label": "Temperature",
+        "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0.6,
+        "range": {
+          "min": 0,
+          "max": 1
+        }
+      },
+      {
+        "path": "top_p",
+        "label": "Top P",
+        "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0.95,
+        "range": {
+          "max": 1
+        }
+      },
+      {
+        "path": "max_tokens",
+        "label": "Max tokens",
+        "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.",
+        "group": "generation_length",
+        "type": "integer",
+        "default": 4096,
+        "range": {
+          "min": 1,
+          "max": 16384
+        }
+      },
+      {
+        "path": "frequency_penalty",
+        "label": "Frequency penalty",
+        "description": "Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0,
+        "range": {
+          "min": -2,
+          "max": 2
+        }
+      },
+      {
+        "path": "presence_penalty",
+        "label": "Presence penalty",
+        "description": "Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0,
+        "range": {
+          "min": -2,
+          "max": 2
+        }
+      },
+      {
+        "path": "seed",
+        "label": "Seed",
+        "description": "Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.",
+        "group": "sampling",
+        "type": "integer",
+        "default": 0,
+        "range": {
+          "min": 0,
+          "max": 18446744073709552000
+        }
+      },
+      {
+        "path": "stop",
+        "label": "Stop",
+        "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.",
+        "group": "generation_length",
+        "type": "string"
+      }
+    ]
+  },
+  {
+    "provider": "nvidia",
+    "authType": "api_key",
+    "model": "llama-3.1-nemotron-safety-guard-8b-v3",
+    "params": [
+      {
+        "path": "temperature",
+        "label": "Temperature",
+        "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0,
+        "range": {
+          "min": 0,
+          "max": 1
+        }
+      }
+    ]
+  },
+  {
+    "provider": "nvidia",
+    "authType": "api_key",
+    "model": "llama-3.1-nemotron-ultra-253b-v1",
+    "params": [
+      {
+        "path": "temperature",
+        "label": "Temperature",
+        "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0.6,
+        "range": {
+          "min": 0,
+          "max": 1
+        }
+      },
+      {
+        "path": "top_p",
+        "label": "Top P",
+        "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0.95,
+        "range": {
+          "max": 1
+        }
+      },
+      {
+        "path": "max_tokens",
+        "label": "Max tokens",
+        "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.",
+        "group": "generation_length",
+        "type": "integer",
+        "default": 4096,
+        "range": {
+          "min": 1,
+          "max": 16384
+        }
+      },
+      {
+        "path": "frequency_penalty",
+        "label": "Frequency penalty",
+        "description": "Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0,
+        "range": {
+          "min": -2,
+          "max": 2
+        }
+      },
+      {
+        "path": "presence_penalty",
+        "label": "Presence penalty",
+        "description": "Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0,
+        "range": {
+          "min": -2,
+          "max": 2
+        }
+      },
+      {
+        "path": "seed",
+        "label": "Seed",
+        "description": "Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.",
+        "group": "sampling",
+        "type": "integer",
+        "default": 0,
+        "range": {
+          "min": 0,
+          "max": 18446744073709552000
+        }
+      },
+      {
+        "path": "stop",
+        "label": "Stop",
+        "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.",
+        "group": "generation_length",
+        "type": "string"
+      }
+    ]
+  },
+  {
+    "provider": "nvidia",
+    "authType": "api_key",
+    "model": "llama-3.3-nemotron-super-49b-v1",
+    "params": [
+      {
+        "path": "temperature",
+        "label": "Temperature",
+        "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0.6,
+        "range": {
+          "min": 0,
+          "max": 1
+        }
+      },
+      {
+        "path": "top_p",
+        "label": "Top P",
+        "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0.95,
+        "range": {
+          "max": 1
+        }
+      },
+      {
+        "path": "max_tokens",
+        "label": "Max tokens",
+        "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.",
+        "group": "generation_length",
+        "type": "integer",
+        "default": 4096,
+        "range": {
+          "min": 1,
+          "max": 16384
+        }
+      },
+      {
+        "path": "frequency_penalty",
+        "label": "Frequency penalty",
+        "description": "Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0,
+        "range": {
+          "min": -2,
+          "max": 2
+        }
+      },
+      {
+        "path": "presence_penalty",
+        "label": "Presence penalty",
+        "description": "Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0,
+        "range": {
+          "min": -2,
+          "max": 2
+        }
+      },
+      {
+        "path": "seed",
+        "label": "Seed",
+        "description": "Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.",
+        "group": "sampling",
+        "type": "integer",
+        "default": 0,
+        "range": {
+          "min": 0,
+          "max": 18446744073709552000
+        }
+      },
+      {
+        "path": "stop",
+        "label": "Stop",
+        "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.",
+        "group": "generation_length",
+        "type": "string"
+      }
+    ]
+  },
+  {
+    "provider": "nvidia",
+    "authType": "api_key",
+    "model": "llama-3.3-nemotron-super-49b-v1.5",
+    "params": [
+      {
+        "path": "temperature",
+        "label": "Temperature",
+        "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0.6,
+        "range": {
+          "min": 0,
+          "max": 1
+        }
+      },
+      {
+        "path": "top_p",
+        "label": "Top P",
+        "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0.95,
+        "range": {
+          "max": 1
+        }
+      },
+      {
+        "path": "max_tokens",
+        "label": "Max tokens",
+        "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.",
+        "group": "generation_length",
+        "type": "integer",
+        "default": 65536,
+        "range": {
+          "min": 1,
+          "max": 65536
+        }
+      },
+      {
+        "path": "frequency_penalty",
+        "label": "Frequency penalty",
+        "description": "Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0,
+        "range": {
+          "min": -2,
+          "max": 2
+        }
+      },
+      {
+        "path": "presence_penalty",
+        "label": "Presence penalty",
+        "description": "Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0,
+        "range": {
+          "min": -2,
+          "max": 2
+        }
+      },
+      {
+        "path": "seed",
+        "label": "Seed",
+        "description": "Best-effort deterministic sampling seed. Changing the seed produces a different response with similar characteristics. Fix the seed to reproduce results.",
+        "group": "sampling",
+        "type": "integer",
+        "default": 0,
+        "range": {
+          "min": 0,
+          "max": 18446744073709552000
+        }
+      },
+      {
+        "path": "stop",
+        "label": "Stop",
+        "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.",
+        "group": "generation_length",
+        "type": "string"
+      }
+    ]
+  },
+  {
+    "provider": "nvidia",
+    "authType": "api_key",
+    "model": "nemoguard-jailbreak-detect",
+    "params": []
+  },
+  {
+    "provider": "nvidia",
+    "authType": "api_key",
+    "model": "nemotron-3-nano-30b-a3b",
+    "params": [
+      {
+        "path": "temperature",
+        "label": "Temperature",
+        "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 1,
+        "range": {
+          "max": 1
+        }
+      },
+      {
+        "path": "top_p",
+        "label": "Top P",
+        "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 1,
+        "range": {
+          "max": 1
+        }
+      },
+      {
+        "path": "max_tokens",
+        "label": "Max tokens",
+        "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.",
+        "group": "generation_length",
+        "type": "integer",
+        "default": 16384,
+        "range": {
+          "min": 1,
+          "max": 32768
+        }
+      },
+      {
+        "path": "seed",
+        "label": "Seed",
+        "description": "Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result.",
+        "group": "sampling",
+        "type": "integer",
+        "range": {
+          "min": 0,
+          "max": 18446744073709552000
+        }
+      },
+      {
+        "path": "stop",
+        "label": "Stop",
+        "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.",
+        "group": "generation_length",
+        "type": "string"
+      }
+    ]
+  },
+  {
+    "provider": "nvidia",
+    "authType": "api_key",
+    "model": "nemotron-3-super-120b-a12b",
+    "params": [
+      {
+        "path": "temperature",
+        "label": "Temperature",
+        "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 1,
+        "range": {
+          "max": 1
+        }
+      },
+      {
+        "path": "top_p",
+        "label": "Top P",
+        "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0.95,
+        "range": {
+          "max": 1
+        }
+      },
+      {
+        "path": "max_tokens",
+        "label": "Max tokens",
+        "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.",
+        "group": "generation_length",
+        "type": "integer",
+        "default": 16384,
+        "range": {
+          "min": 1,
+          "max": 32768
+        }
+      },
+      {
+        "path": "reasoning_effort",
+        "label": "Reasoning effort",
+        "description": "Controls the reasoning mode. 'none' disables reasoning tokens, 'low' enables low-effort reasoning, and 'high' enables full reasoning.",
+        "group": "reasoning",
+        "type": "enum",
+        "default": "high",
+        "values": [
+          "none",
+          "low",
+          "high"
+        ]
+      },
+      {
+        "path": "reasoning_budget",
+        "label": "Reasoning budget",
+        "description": "Maximum number of tokens the model may use for internal reasoning before being forced to end the reasoning trace. Use -1 to disable budget enforcement.",
+        "group": "reasoning",
+        "type": "integer",
+        "default": 16384,
+        "range": {
+          "min": -1,
+          "max": 32768
+        }
+      },
+      {
+        "path": "seed",
+        "label": "Seed",
+        "description": "Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result.",
+        "group": "sampling",
+        "type": "integer",
+        "range": {
+          "min": 0,
+          "max": 18446744073709552000
+        }
+      },
+      {
+        "path": "stop",
+        "label": "Stop",
+        "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.",
+        "group": "generation_length",
+        "type": "string"
+      }
+    ]
+  },
+  {
+    "provider": "nvidia",
+    "authType": "api_key",
+    "model": "nemotron-3-ultra-550b-a55b",
+    "params": [
+      {
+        "path": "temperature",
+        "label": "Temperature",
+        "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 1,
+        "range": {
+          "max": 1
+        }
+      },
+      {
+        "path": "top_p",
+        "label": "Top P",
+        "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0.95,
+        "range": {
+          "max": 1
+        }
+      },
+      {
+        "path": "max_tokens",
+        "label": "Max tokens",
+        "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.",
+        "group": "generation_length",
+        "type": "integer",
+        "default": 16384,
+        "range": {
+          "min": 1,
+          "max": 32768
+        }
+      },
+      {
+        "path": "reasoning_effort",
+        "label": "Reasoning effort",
+        "description": "Controls the reasoning mode. 'none' disables reasoning tokens, 'medium' enables efficient reasoning, and 'high' enables full reasoning.",
+        "group": "reasoning",
+        "type": "enum",
+        "default": "high",
+        "values": [
+          "none",
+          "medium",
+          "high"
+        ]
+      },
+      {
+        "path": "reasoning_budget",
+        "label": "Reasoning budget",
+        "description": "Maximum number of tokens the model may use for internal reasoning before being forced to end the reasoning trace. Use -1 to disable budget enforcement.",
+        "group": "reasoning",
+        "type": "integer",
+        "default": 16384,
+        "range": {
+          "min": -1,
+          "max": 32768
+        }
+      },
+      {
+        "path": "seed",
+        "label": "Seed",
+        "description": "Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result.",
+        "group": "sampling",
+        "type": "integer",
+        "range": {
+          "min": 0,
+          "max": 18446744073709552000
+        }
+      },
+      {
+        "path": "stop",
+        "label": "Stop",
+        "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.",
+        "group": "generation_length",
+        "type": "string"
+      }
+    ]
+  },
+  {
+    "provider": "nvidia",
+    "authType": "api_key",
+    "model": "nemotron-content-safety-reasoning-4b",
+    "params": [
+      {
+        "path": "temperature",
+        "label": "Temperature",
+        "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 1,
+        "range": {
+          "max": 1
+        }
+      },
+      {
+        "path": "top_p",
+        "label": "Top P",
+        "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 1,
+        "range": {
+          "max": 1
+        }
+      },
+      {
+        "path": "max_tokens",
+        "label": "Max tokens",
+        "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.",
+        "group": "generation_length",
+        "type": "integer",
+        "default": 16384,
+        "range": {
+          "min": 1,
+          "max": 32768
+        }
+      },
+      {
+        "path": "seed",
+        "label": "Seed",
+        "description": "Best-effort deterministic sampling seed. Repeated requests with the same seed and parameters should return the same result.",
+        "group": "sampling",
+        "type": "integer",
+        "range": {
+          "min": 0,
+          "max": 18446744073709552000
+        }
+      },
+      {
+        "path": "stop",
+        "label": "Stop",
+        "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.",
+        "group": "generation_length",
+        "type": "string"
+      }
+    ]
+  },
+  {
+    "provider": "nvidia",
+    "authType": "api_key",
+    "model": "nemotron-mini-4b-instruct",
+    "params": [
+      {
+        "path": "temperature",
+        "label": "Temperature",
+        "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0.2,
+        "range": {
+          "min": 0,
+          "max": 1
+        }
+      },
+      {
+        "path": "top_p",
+        "label": "Top P",
+        "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0.7,
+        "range": {
+          "max": 1
+        }
+      },
+      {
+        "path": "max_tokens",
+        "label": "Max tokens",
+        "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.",
+        "group": "generation_length",
+        "type": "integer",
+        "default": 1024,
+        "range": {
+          "min": 1,
+          "max": 4096
+        }
+      },
+      {
+        "path": "frequency_penalty",
+        "label": "Frequency penalty",
+        "description": "Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0,
+        "range": {
+          "min": -2,
+          "max": 2
+        }
+      },
+      {
+        "path": "presence_penalty",
+        "label": "Presence penalty",
+        "description": "Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0,
+        "range": {
+          "min": -2,
+          "max": 2
+        }
+      },
+      {
+        "path": "stop",
+        "label": "Stop",
+        "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.",
+        "group": "generation_length",
+        "type": "string"
+      }
+    ]
+  },
+  {
+    "provider": "nvidia",
+    "authType": "api_key",
+    "model": "riva-translate-4b-instruct-v1.1",
+    "params": [
+      {
+        "path": "temperature",
+        "label": "Temperature",
+        "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0,
+        "range": {
+          "min": 0,
+          "max": 1
+        }
+      },
+      {
+        "path": "top_p",
+        "label": "Top P",
+        "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0.9,
+        "range": {
+          "max": 1
+        }
+      },
+      {
+        "path": "max_tokens",
+        "label": "Max tokens",
+        "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.",
+        "group": "generation_length",
+        "type": "integer",
+        "default": 512,
+        "range": {
+          "min": 1,
+          "max": 4096
+        }
+      },
+      {
+        "path": "frequency_penalty",
+        "label": "Frequency penalty",
+        "description": "Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0,
+        "range": {
+          "min": -2,
+          "max": 2
+        }
+      },
+      {
+        "path": "presence_penalty",
+        "label": "Presence penalty",
+        "description": "Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0,
+        "range": {
+          "min": -2,
+          "max": 2
+        }
+      },
+      {
+        "path": "stop",
+        "label": "Stop",
+        "description": "A string or list of strings where the API will stop generating further tokens. The returned text will not contain the stop sequence.",
+        "group": "generation_length",
+        "type": "string"
+      }
+    ]
+  },
+  {
+    "provider": "nvidia",
+    "authType": "api_key",
+    "model": "usdcode-llama-3.1-70b-instruct",
+    "params": [
+      {
+        "path": "temperature",
+        "label": "Temperature",
+        "description": "Controls randomness. Lower values make outputs more focused; higher values make them more varied. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 0.1,
+        "range": {
+          "min": 0,
+          "max": 1
+        }
+      },
+      {
+        "path": "top_p",
+        "label": "Top P",
+        "description": "Controls nucleus sampling by limiting generation to tokens within the selected cumulative probability. Not recommended to modify both temperature and top_p in the same call.",
+        "group": "sampling",
+        "type": "number",
+        "default": 1,
+        "range": {
+          "max": 1
+        }
+      },
+      {
+        "path": "max_tokens",
+        "label": "Max tokens",
+        "description": "Maximum number of tokens to generate. Generation stops when this limit is reached.",
+        "group": "generation_length",
+        "type": "integer",
+        "default": 1024,
+        "range": {
+          "min": 1,
+          "max": 2048
+        }
+      },
+      {
+        "path": "expert_type",
+        "label": "Expert type",
+        "description": "The type of expert to use. 'knowledge' answers with USD knowledge, 'code' responds with vanilla OpenUSD code, 'helperfunction' uses high-level helper functions, and 'auto' lets the LLM determine which expert to use.",
+        "group": "provider_metadata",
+        "type": "enum",
+        "default": "auto",
+        "values": [
+          "auto",
+          "code",
+          "knowledge",
+          "helperfunction"
+        ]
+      }
+    ]
+  },
   {
     "provider": "openai",
     "authType": "api_key",
diff --git a/packages/modelparams/src/generated/defaults.ts b/packages/modelparams/src/generated/defaults.ts
index e64886f..fdb86a2 100644
--- a/packages/modelparams/src/generated/defaults.ts
+++ b/packages/modelparams/src/generated/defaults.ts
@@ -730,6 +730,99 @@ export const DEFAULTS = {
     frequency_penalty: 0,
     "response_format.type": "text",
   },
+  "nvidia/gliner-pii": {
+    threshold: 0.5,
+    chunk_length: 384,
+    overlap: 128,
+    flat_ner: false,
+  },
+  "nvidia/llama-3.1-nemoguard-8b-topic-control": {
+    temperature: 0.5,
+    top_p: 1,
+    max_tokens: 1024,
+    frequency_penalty: 0,
+    presence_penalty: 0,
+  },
+  "nvidia/llama-3.1-nemotron-nano-8b-v1": {
+    temperature: 0.6,
+    top_p: 0.95,
+    max_tokens: 4096,
+    frequency_penalty: 0,
+    presence_penalty: 0,
+    seed: 0,
+  },
+  "nvidia/llama-3.1-nemotron-safety-guard-8b-v3": {
+    temperature: 0,
+  },
+  "nvidia/llama-3.1-nemotron-ultra-253b-v1": {
+    temperature: 0.6,
+    top_p: 0.95,
+    max_tokens: 4096,
+    frequency_penalty: 0,
+    presence_penalty: 0,
+    seed: 0,
+  },
+  "nvidia/llama-3.3-nemotron-super-49b-v1": {
+    temperature: 0.6,
+    top_p: 0.95,
+    max_tokens: 4096,
+    frequency_penalty: 0,
+    presence_penalty: 0,
+    seed: 0,
+  },
+  "nvidia/llama-3.3-nemotron-super-49b-v1.5": {
+    temperature: 0.6,
+    top_p: 0.95,
+    max_tokens: 65536,
+    frequency_penalty: 0,
+    presence_penalty: 0,
+    seed: 0,
+  },
+  "nvidia/nemoguard-jailbreak-detect": {},
+  "nvidia/nemotron-3-nano-30b-a3b": {
+    temperature: 1,
+    top_p: 1,
+    max_tokens: 16384,
+  },
+  "nvidia/nemotron-3-super-120b-a12b": {
+    temperature: 1,
+    top_p: 0.95,
+    max_tokens: 16384,
+    reasoning_effort: "high",
+    reasoning_budget: 16384,
+  },
+  "nvidia/nemotron-3-ultra-550b-a55b": {
+    temperature: 1,
+    top_p: 0.95,
+    max_tokens: 16384,
+    reasoning_effort: "high",
+    reasoning_budget: 16384,
+  },
+  "nvidia/nemotron-content-safety-reasoning-4b": {
+    temperature: 1,
+    top_p: 1,
+    max_tokens: 16384,
+  },
+  "nvidia/nemotron-mini-4b-instruct": {
+    temperature: 0.2,
+    top_p: 0.7,
+    max_tokens: 1024,
+    frequency_penalty: 0,
+    presence_penalty: 0,
+  },
+  "nvidia/riva-translate-4b-instruct-v1.1": {
+    temperature: 0,
+    top_p: 0.9,
+    max_tokens: 512,
+    frequency_penalty: 0,
+    presence_penalty: 0,
+  },
+  "nvidia/usdcode-llama-3.1-70b-instruct": {
+    temperature: 0.1,
+    top_p: 1,
+    max_tokens: 1024,
+    expert_type: "auto",
+  },
   "openai/chatgpt-4o-latest": {
     max_tokens: 4096,
     temperature: 1,
diff --git a/packages/modelparams/src/generated/model-ids.ts b/packages/modelparams/src/generated/model-ids.ts
index 33a1830..3b7eae0 100644
--- a/packages/modelparams/src/generated/model-ids.ts
+++ b/packages/modelparams/src/generated/model-ids.ts
@@ -109,6 +109,21 @@ export const MODEL_IDS = [
   "moonshot/moonshot-v1-128k",
   "moonshot/moonshot-v1-32k",
   "moonshot/moonshot-v1-8k",
+  "nvidia/gliner-pii",
+  "nvidia/llama-3.1-nemoguard-8b-topic-control",
+  "nvidia/llama-3.1-nemotron-nano-8b-v1",
+  "nvidia/llama-3.1-nemotron-safety-guard-8b-v3",
+  "nvidia/llama-3.1-nemotron-ultra-253b-v1",
+  "nvidia/llama-3.3-nemotron-super-49b-v1",
+  "nvidia/llama-3.3-nemotron-super-49b-v1.5",
+  "nvidia/nemoguard-jailbreak-detect",
+  "nvidia/nemotron-3-nano-30b-a3b",
+  "nvidia/nemotron-3-super-120b-a12b",
+  "nvidia/nemotron-3-ultra-550b-a55b",
+  "nvidia/nemotron-content-safety-reasoning-4b",
+  "nvidia/nemotron-mini-4b-instruct",
+  "nvidia/riva-translate-4b-instruct-v1.1",
+  "nvidia/usdcode-llama-3.1-70b-instruct",
   "openai/chatgpt-4o-latest",
   "openai/gpt-3.5-turbo",
   "openai/gpt-4-turbo",
@@ -192,6 +207,7 @@ export const PROVIDERS = [
   "minimax",
   "mistral",
   "moonshot",
+  "nvidia",
   "openai",
   "perplexity",
   "xai",
diff --git a/packages/modelparams/src/generated/params-by-id.ts b/packages/modelparams/src/generated/params-by-id.ts
index 67765f8..17357d0 100644
--- a/packages/modelparams/src/generated/params-by-id.ts
+++ b/packages/modelparams/src/generated/params-by-id.ts
@@ -913,6 +913,114 @@ export type ParamsById = {
     frequency_penalty: number;
     "response_format.type": "text" | "json_object";
   };
+  "nvidia/gliner-pii": {
+    threshold: number;
+    chunk_length: number;
+    overlap: number;
+    flat_ner: boolean;
+  };
+  "nvidia/llama-3.1-nemoguard-8b-topic-control": {
+    temperature: number;
+    top_p: number;
+    max_tokens: number;
+    frequency_penalty: number;
+    presence_penalty: number;
+    stop: string;
+  };
+  "nvidia/llama-3.1-nemotron-nano-8b-v1": {
+    temperature: number;
+    top_p: number;
+    max_tokens: number;
+    frequency_penalty: number;
+    presence_penalty: number;
+    seed: number;
+    stop: string;
+  };
+  "nvidia/llama-3.1-nemotron-safety-guard-8b-v3": {
+    temperature: number;
+  };
+  "nvidia/llama-3.1-nemotron-ultra-253b-v1": {
+    temperature: number;
+    top_p: number;
+    max_tokens: number;
+    frequency_penalty: number;
+    presence_penalty: number;
+    seed: number;
+    stop: string;
+  };
+  "nvidia/llama-3.3-nemotron-super-49b-v1": {
+    temperature: number;
+    top_p: number;
+    max_tokens: number;
+    frequency_penalty: number;
+    presence_penalty: number;
+    seed: number;
+    stop: string;
+  };
+  "nvidia/llama-3.3-nemotron-super-49b-v1.5": {
+    temperature: number;
+    top_p: number;
+    max_tokens: number;
+    frequency_penalty: number;
+    presence_penalty: number;
+    seed: number;
+    stop: string;
+  };
+  "nvidia/nemoguard-jailbreak-detect": Record<string, never>;
+  "nvidia/nemotron-3-nano-30b-a3b": {
+    temperature: number;
+    top_p: number;
+    max_tokens: number;
+    seed: number;
+    stop: string;
+  };
+  "nvidia/nemotron-3-super-120b-a12b": {
+    temperature: number;
+    top_p: number;
+    max_tokens: number;
+    reasoning_effort: "none" | "low" | "high";
+    reasoning_budget: number;
+    seed: number;
+    stop: string;
+  };
+  "nvidia/nemotron-3-ultra-550b-a55b": {
+    temperature: number;
+    top_p: number;
+    max_tokens: number;
+    reasoning_effort: "none" | "medium" | "high";
+    reasoning_budget: number;
+    seed: number;
+    stop: string;
+  };
+  "nvidia/nemotron-content-safety-reasoning-4b": {
+    temperature: number;
+    top_p: number;
+    max_tokens: number;
+    seed: number;
+    stop: string;
+  };
+  "nvidia/nemotron-mini-4b-instruct": {
+    temperature: number;
+    top_p: number;
+    max_tokens: number;
+    frequency_penalty: number;
+    presence_penalty: number;
+    stop: string;
+  };
+  "nvidia/riva-translate-4b-instruct-v1.1": {
+    temperature: number;
+    top_p: number;
+    max_tokens: number;
+    frequency_penalty: number;
+    presence_penalty: number;
+    stop: string;
+  };
+  "nvidia/usdcode-llama-3.1-70b-instruct": {
+    temperature: number;
+    top_p: number;
+    max_tokens: number;
+    expert_type: "auto" | "code" | "knowledge" | "helperfunction";
+  };
   "openai/chatgpt-4o-latest": {
     max_tokens: number;
     temperature: number;