SemiAnalysisAI · qiching · Jun 11, 2026 · Jun 11, 2026 · claude · Jun 15, 2026
diff --git a/.github/workflows/speedbench-al.yml b/.github/workflows/speedbench-al.yml
@@ -209,6 +209,17 @@ jobs:
           path: speedbench_results/server_*.log
           if-no-files-found: ignore
 
+      # Per-request benchmark detail (vllm bench serve --save-detailed): includes
+      # the model's response text per request, so the AL of each cell can be
+      # audited for output correctness (sensible text + correct thinking mode).
+      - name: Upload detailed benchmark results
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: speedbench_detailed_results-${{ inputs.model-prefix }}
+          path: speedbench_results/speedbench_*
+          if-no-files-found: ignore
+
       - name: Resource cleanup (post-run)
         if: always()
         run: *resource-cleanup
diff --git a/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh
@@ -0,0 +1,243 @@
+#!/usr/bin/env bash
+
+# DeepSeek-R1 B300 vLLM SPEED-Bench AL matrix collector.
+#
+# Produces the golden acceptance-length (AL) reference matrix consumed by the
+# synthetic-acceptance framework: for each MTP level (num_speculative_tokens),
+# measure the REAL AL on a single SPEED-Bench category (default: coding) and emit
+# a YAML matrix identical in shape to benchmarks/speedbench-reference-al.yaml.
+# This measures real MTP acceptance; the synthetic value is injected downstream
+# by the throughput recipe, not here.
+#
+# Adapted from speedbench/dsv4_fp4_b300_vllm.sh. DeepSeek-R1 is DeepSeek-V3
+# architecture (MLA dense attention), NOT V4 (DSA / Lightning Indexer), so vs the
+# DSV4 collector:
+#   - NO --tokenizer-mode deepseek_v4 / --reasoning-parser deepseek_v4 /
+#     --tool-call-parser deepseek_v4   (all V4-specific; the official vLLM R1
+#     serve command is bare). reasoning-parser is irrelevant here anyway: AL is
+#     read from /metrics, not from parsed output.
+#   - NO --attention_config.use_fp4_indexer_cache (that knob is dsv32/MLA-indexer
+#     only; R1 is plain MLA and never reads it).
+#   - NO --block-size / --compilation-config (the official R1 recipe omits them;
+#     defaults apply). --kv-cache-dtype fp8 IS kept, to match the dsv4/qwen/glm
+#     collectors so all golden AL values share one kv-cache numeric regime.
+#   - FP4 on Blackwell needs FlashInfer MoE: export VLLM_USE_FLASHINFER_MOE_FP4=1.
+#   - THINKING: R1 is a pure reasoning model and always emits <think> (its chat
+#     template has no enable_thinking toggle). There is no thinking-off mode, so
+#     this collector measures thinking_on only and needs no --chat-template-kwargs
+#     shim (the default client-side template render already enables thinking).
+#
+# Checkpoint (B300 / Blackwell): NVFP4 build nvidia/DeepSeek-R1-0528-NVFP4-v2,
+# basename dsr1-fp4 on the runner (resolved by launch_b300-nv.sh).
+#
+# Usage (inside the vLLM container, on a B300 node):
+#   export MODEL=/data/models/dsr1-fp4
+#   bash benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh
+#
+# Tunables (env):
+#   MTP_LIST          space-separated MTP levels   (default "1 2 3 4 5 6 7 8")
+#   THINKING_MODES    space-separated: on           (default "on"; R1 has no off)
+#   CATEGORY          SPEED-Bench category          (default coding)
+#   SPEEDBENCH_OUTPUT_LEN  per-request output len   (default 4096)
+#   OUT_YAML          output matrix path            (default $RESULTS_DIR/speedbench-reference-al.yaml)
+
+set -uo pipefail
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+MODEL="${MODEL:?MODEL env var required (e.g. /data/models/dsr1-fp4)}"
+SERVE_MODEL="${MODEL_PATH:-$MODEL}"
+TP="${TP:-8}"
+DP_ATTENTION="${DP_ATTENTION:-false}"
+PORT="${PORT:-8888}"
+
+MTP_LIST="${MTP_LIST:-1 2 3 4 5 6 7 8}"
+THINKING_MODES="${THINKING_MODES:-on}"
+CATEGORY="${CATEGORY:-coding}"
+MODEL_KEY="${MODEL_KEY:-$(basename "$SERVE_MODEL" | tr '[:upper:]' '[:lower:]')}"
+SPEEDBENCH_OUTPUT_LEN="${SPEEDBENCH_OUTPUT_LEN:-4096}"
+CONCURRENCY="${CONCURRENCY:-1}"
+# Provider-recommended sampling from the DeepSeek-R1 checkpoint generation_config
+# (temperature 0.6, top_p 0.95; no top_k). vLLM's own default top_p is 1.0, so it
+# MUST be passed explicitly or the measured AL is taken at the wrong settings.
+TEMPERATURE="${TEMPERATURE:-0.6}"
+TOP_P="${TOP_P:-0.95}"
+
+SPEEDBENCH_DIR="${SPEEDBENCH_DIR:-/workspace/speed_bench_data}"
+# Flat results dir to match the speedbench-al.yml artifact glob
+# (speedbench_results/server_*.log) and its pre-run `rm -rf speedbench_results`.
+RESULTS_DIR="${RESULTS_DIR:-/workspace/speedbench_results}"
+OUT_YAML="${OUT_YAML:-$RESULTS_DIR/speedbench-reference-al.yaml}"
+
+# Blackwell FP4 MoE path (DeepSeek-R1 FP4 on B-series): required per vLLM R1 docs.
+export VLLM_USE_FLASHINFER_MOE_FP4="${VLLM_USE_FLASHINFER_MOE_FP4:-1}"
+export VLLM_ENGINE_READY_TIMEOUT_S=3600
+
+mkdir -p "$RESULTS_DIR"
+nvidia-smi
+if [[ "$SERVE_MODEL" != /* ]]; then hf download "$SERVE_MODEL"; fi
+
+# ---- Download SPEED-Bench dataset ----
+echo "=== Downloading SPEED-Bench dataset ==="
+pip install -q datasets tiktoken
+curl -LsSf https://raw.githubusercontent.com/NVIDIA-NeMo/Skills/refs/heads/main/nemo_skills/dataset/speed-bench/prepare.py \
+  | python3 - --config qualitative --output_dir "$SPEEDBENCH_DIR"
+
+if [[ ! -f "$SPEEDBENCH_DIR/qualitative.jsonl" ]]; then
+    echo "CRITICAL: SPEED-Bench download failed — $SPEEDBENCH_DIR/qualitative.jsonl not found"
+    exit 1
+fi
+
+PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
+if [ "${DP_ATTENTION}" = "true" ]; then
+    PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
+fi
+
+fetch_metric() {
+    local port="$1" name="$2"
+    curl -s "http://localhost:${port}/metrics" \
+      | grep -oP "${name}\\{[^}]*\\} \\K[0-9.]+" || echo "0"
+}
+
+SERVER_PID=""
+_descendants() {
+    local pid="$1" child
+    for child in $(pgrep -P "$pid" 2>/dev/null || true); do
+        echo "$child"
+        _descendants "$child"
+    done
+}
+cleanup_server() {
+    if [[ -n "$SERVER_PID" ]]; then
+        local descendants
+        descendants=$(_descendants "$SERVER_PID")
+        kill "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+        local pid
+        for pid in $descendants; do
+            kill -9 "$pid" 2>/dev/null || true
+        done
+        local waited=0
+        while [[ $waited -lt 120 ]]; do
+            local used
+            used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | sort -rn | head -1)
+            if [[ -z "$used" || "$used" -lt 2000 ]]; then break; fi
+            sleep 3; waited=$((waited + 3))
+        done
+        SERVER_PID=""
+    fi
+}
+trap 'cleanup_server' EXIT
+
+start_gpu_monitor
+
+declare -A AL_RESULT
+
+run_cell() {
+    local mode="$1" mtp="$2"
+
+    echo ""
+    echo "=========================================="
+    echo "  Cell: thinking=$mode  MTP=$mtp  category=$CATEGORY"
+    echo "=========================================="
+
+    local serve_args=(
+        --host 0.0.0.0 --port "$PORT"
+        "${PARALLEL_ARGS[@]}"
+        --pipeline-parallel-size 1
+        --trust-remote-code
+        --enable-expert-parallel
-        --enable-expert-parallel
+-        --enable-expert-parallel
+        "${EP_ARGS[@]}"
-        --enable-expert-parallel
+-        --enable-expert-parallel
+        "${EP_ARGS[@]}"
+        --kv-cache-dtype fp8
+        --no-enable-prefix-caching
+        --max-model-len 16384
+        --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $mtp}"
+    )
+
+    local server_log="$RESULTS_DIR/server_${mode}_mtp${mtp}.log"
+    vllm serve "$SERVE_MODEL" "${serve_args[@]}" > "$server_log" 2>&1 &
+    SERVER_PID=$!
+
+    if ! wait_for_server_ready --port "$PORT" --server-log "$server_log" --server-pid "$SERVER_PID"; then
+        echo "  -> server failed to start (thinking=$mode mtp=$mtp), recording N/A"
+        AL_RESULT["${mode}_${mtp}"]="N/A"
+        cleanup_server
+        return
+    fi
+
+    local acc_before drf_before acc_after drf_after
+    acc_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total")
+    drf_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total")
+
+    vllm bench serve \
+        --model "$SERVE_MODEL" \
+        --port "$PORT" \
+        --dataset-name speed_bench \
+        --dataset-path "$SPEEDBENCH_DIR" \
+        --speed-bench-category "$CATEGORY" \
+        --speed-bench-output-len "$SPEEDBENCH_OUTPUT_LEN" \
+        --num-prompts -1 \
+        --max-concurrency "$CONCURRENCY" \
+        --save-result \
+        --save-detailed \
+        --result-dir "$RESULTS_DIR" \
+        --result-filename "speedbench_${mode}_mtp${mtp}" \
+        --trust-remote-code \
+        --temperature "$TEMPERATURE" \
+        --top-p "$TOP_P"
+
+    acc_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total")
+    drf_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total")
+
+    local delta_acc delta_drf al
+    delta_acc=$(awk "BEGIN {printf \"%d\", $acc_after - $acc_before}")
+    delta_drf=$(awk "BEGIN {printf \"%d\", $drf_after - $drf_before}")
+    if [[ "$delta_drf" -gt 0 ]]; then
+        al=$(awk "BEGIN {printf \"%.2f\", 1 + ($delta_acc / $delta_drf)}")
+    else
+        al="N/A"
+    fi
+    echo "  -> thinking=$mode MTP=$mtp AL=$al (accepted=$delta_acc drafts=$delta_drf)"
+    AL_RESULT["${mode}_${mtp}"]="$al"
+
+    cleanup_server
+}
+
+for mode in $THINKING_MODES; do
+    for mtp in $MTP_LIST; do
+        run_cell "$mode" "$mtp"
+    done
+done
+
+stop_gpu_monitor
+
+# ---- Emit the YAML matrix ----
+emit_mode_block() {
+    local mode="$1"
+    for mtp in $MTP_LIST; do
+        echo "    $mtp: ${AL_RESULT[${mode}_${mtp}]:-N/A}"
+    done
+}
+
+{
+    echo "# Acceptance Length (AL) reference values measured with SPEED-Bench."
+    echo "# dataset: $CATEGORY | temperature: $TEMPERATURE | top_p: $TOP_P | output_len: $SPEEDBENCH_OUTPUT_LEN"
+    echo "# DeepSeek-R1 always reasons (no thinking-off mode), so only thinking_on is emitted."
+    echo "# Measured on $MODEL_KEY (B300, vLLM MTP), per num_speculative_tokens."
+    echo "# Auto-generated by benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh (speedbench-al.yml)."
+    echo "#"
+    echo "# key = num_speculative_tokens (MTP level); value = golden AL"
+    echo "${MODEL_KEY}:"
+    if [[ " $THINKING_MODES " == *" on "* ]]; then
+        echo "  thinking_on:"
+        emit_mode_block on
+    fi
+    if [[ " $THINKING_MODES " == *" off "* ]]; then
+        echo "  thinking_off:"
+        emit_mode_block off
+    fi
+} > "$OUT_YAML"
+
+echo ""
+echo "=========================================="
+echo "  SPEED-Bench AL matrix written to: $OUT_YAML"
+echo "=========================================="
+cat "$OUT_YAML"
diff --git a/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
@@ -275,6 +275,7 @@ run_cell() {
         --num-prompts -1 \
         --max-concurrency "$CONCURRENCY" \
         --save-result \
+        --save-detailed \
         --result-dir "$RESULTS_DIR" \
         --result-filename "speedbench_${mode}_mtp${mtp}" \
         --trust-remote-code \