SemiAnalysisAI · Oseltamivir · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026
@@ -12604,6 +12604,193 @@ qwen3.5-fp4-b200-trt:
       - { tp: 4, ep: 4, dp-attn: true, conc-list: [1024] }
       - { tp: 8, ep: 8, dp-attn: true, conc-list: [256, 512, 1024] }
 
+# MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
+# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
+# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX
+# tensor cores on Blackwell. Image is the multi-arch m3_release vLLM build
+# (vllm/vllm-openai:minimax-m3, vllm-project/vllm#45381); recipes set
+# dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime AND
+# NIXL are layered in at job start (the ai-dynamo vllm-runtime dev image
+# shipped without NIXL, so disagg workers crashed at NixlConnector init).
+# block-size 128 mandatory (MSA index-cache alignment); FLASHINFER
+# (trtllm-gen) attention to exploit Blackwell — needs vllm#45381 @ 022448dd
+# (m3_release HEAD: gates page>=128 on trtllm-gen GQA), so rebuild the image
+# from m3_release before running. Fully disaggregated, rack-scale wide-EP
+# GB200 sweep (NixlConnector P/D split over the NVL72 NVLink fabric). Mirrors
+# the deepseek-v4 "megamoe" ladder: DEP unit = DP-attn + expert-parallel
+# (DEP8 = 8 GPU / 2 nodes, DEP16 = 16 GPU / 4 nodes), with prefill workers
+# scaled 1P->4P. EP8/EP16 vs B200's 8-GPU NVLink island is the GB200 edge.
+# 1P1D TP4->TEP8 (low conc), 1P1D DEP8 (mid), 1P1D DEP8->DEP16 (wide decode),
+# 2P1D / 4P1D DEP8->DEP16 (prefill-scaled max throughput). M3 = 128 experts.
+minimaxm3-fp8-gb200-dynamo-vllm:
+  image: vllm/vllm-openai:minimax-m3
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: gb200
+  precision: fp8
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes.
+      # EP splits 128 MoE experts across 8 decode ranks (16 each), cutting
+      # per-step latency ~19% vs pure TP8. Matches B200 TEP8 topology.
+      - conc-list: [1, 2, 4, 8, 16, 32, 64]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+
+      # Mid curve: 1P+1D, DEP8 prefill -> DEP8 decode (DP-attn + EP8), 4 nodes.
+      - conc-list: [128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # Wide decode: 1P+1D, DEP8 prefill -> DEP16 decode, 6 nodes.
+      - conc-list: [512, 1024, 2048]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Prefill-scaled: 2P+1D, 2x DEP8 prefill -> DEP16 decode, 8 nodes.
+      - conc-list: [2048, 4096]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Max throughput: 4P+1D, 4x DEP8 prefill -> DEP16 decode, 12 nodes.
+      - conc-list: [4096, 8192]
+        prefill:
+          num-worker: 4
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # Low latency 8k1k: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes.
+      - conc-list: [1, 2, 4, 8, 16]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+
+      # Mid curve 8k1k: 1P+1D, DEP8 prefill -> DEP8 decode, 4 nodes.
+      - conc-list: [32, 64, 128]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # Wide decode 8k1k: 1P+1D, DEP8 prefill -> DEP16 decode, 6 nodes.
+      - conc-list: [128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Prefill-scaled 8k1k: 2P+1D, 2x DEP8 prefill -> DEP16 decode, 8 nodes.
+      - conc-list: [512, 1024]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Max throughput 8k1k: 4P+1D, 4x DEP8 prefill -> DEP16 decode, 12 nodes.
+      - conc-list: [1024, 2048]
+        prefill:
+          num-worker: 4
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor

diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -123,6 +123,11 @@ on:
 
 env:
   RANDOM_RANGE_RATIO: 0.8
+  # Day-zero models resolved via hf: ids are downloaded from the Hub inside
+  # the slurm job (srtctl pre-download + dynamo hub fetch). Anonymous
+  # requests get rate-limited (HTTP 429) when several workers pull a 444 GB
+  # snapshot at once; sbatch/srun inherit this env so the token reaches them.
+  HF_TOKEN: ${{ secrets.INFERENCEX_OFFICIAL_RO_HF_TOKEN }}
   EXP_NAME: ${{ inputs.exp-name }}
   IMAGE: ${{ inputs.image }}
   MODEL_PREFIX: ${{ inputs.model-prefix }}

diff --git a/...ulti_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml b/...ulti_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
@@ -0,0 +1,118 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-1k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, wide EP).
+# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector ->
+# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel
+# over the NVL72 NVLink fabric -- the regime where GB200 pulls ahead of
+# B200 (capped at an 8-GPU NVLink island). M3 has 128 routed experts so
+# EP8 shards 16 experts/rank. FLASHINFER attention, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720  # 720 attempts x 10 s interval = up to 2 hours of polling
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2  # 2 nodes x 4 GPUs = the 8 GPUs of the single prefill worker
+  decode_nodes: 2  # 2 nodes x 4 GPUs = the 8 GPUs of the single decode worker
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # NOTE(review): presumably unset because kv-transfer-config below names NixlConnector -- confirm
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # Prefer the NVL72 NVLink fabric (cuda_ipc) over TCP for NIXL/UCX KV
+    # transfer: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # Same NIXL/UCX settings as prefill_environment: KV transfer rides the
+    # NVL72 NVLink fabric (cuda_ipc, cross-node via MNNVL) rather than TCP,
+    # and NCCL_CUMEM_ENABLE=1 keeps buffers cuMem-allocated / IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8  # DEP8: 8 data-parallel ranks with expert parallelism enabled
+      data-parallel-rpc-port: 13346  # distinct from the decode port (13345)
+      enable-expert-parallel: true
+      enforce-eager: true  # prefill only; decode omits this and sets max-cudagraph-capture-size
+      max-model-len: 2304  # 1024 ISL + 1024 OSL = 2048, plus 256 tokens of headroom
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128  # must match the decode block-size
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8  # DEP8: 8 data-parallel ranks with expert parallelism enabled
+      data-parallel-rpc-port: 13345  # distinct from the prefill port (13346)
+      enable-expert-parallel: true
+      max-model-len: 2304  # 1024 ISL + 1024 OSL = 2048, plus 256 tokens of headroom
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512  # equals max-num-seqs and max-num-batched-tokens
+      block-size: 128  # must match the prefill block-size
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "128x256x512"