From 01cc2af8be436575d31120587edc63e957a333a9 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Thu, 11 Jun 2026 15:05:02 +0900
Subject: [PATCH 01/21] [AMD] agentic: add hicache/lmcache configs, update
 agentic scripts for mi355x models

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               | 113 ++-
 .../single_node/agentic/glm5.1_fp4_mi355x.sh  |  96 ++-
 .../agentic/kimik2.5_fp4_mi355x.sh            | 661 ++----------------
 .../agentic/minimaxm2.5_fp4_mi355x.sh         | 272 +++++++
 .../agentic/minimaxm2.5_fp8_mi355x.sh         | 256 ++++++-
 .../single_node/agentic/qwen3.5_fp4_mi355x.sh | 150 ++++
 .../single_node/agentic/qwen3.5_fp8_mi355x.sh | 101 ++-
 runners/launch_mi355x-amds.sh                 |   2 +-
 8 files changed, 980 insertions(+), 671 deletions(-)
 create mode 100755 benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh
 create mode 100755 benchmarks/single_node/agentic/qwen3.5_fp4_mi355x.sh

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index a50d37eab..ee8718506 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -433,6 +433,22 @@ qwen3.5-fp4-mi355x-sglang:
       - { tp: 2, conc-start: 4, conc-end: 256 }
       - { tp: 4, conc-start: 4, conc-end: 16 }
 
+# target
+qwen3.5-fp4-mi355x-sglang-agentic-hicache:
+  image: lmsysorg/sglang:v0.5.12-rocm720-mi35x
+  model: amd/Qwen3.5-397B-A17B-MXFP4
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 2, ep: 1, offloading: none, conc-list: [8, 16, 32, 40, 48, 56, 72] }
+      - { tp: 2, ep: 1, offloading: hicache, conc-list: [8, 16, 32, 40, 48, 56, 72] }
+
 qwen3.5-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
   model: amd/Qwen3.5-397B-A17B-MXFP4
@@ -872,6 +888,22 @@ minimaxm2.5-fp4-mi355x-atom:
       - { tp: 4, conc-start: 4, conc-end: 128 }
       - { tp: 8, conc-start: 4, conc-end: 16 }
 
+# target
+minimaxm2.5-fp4-mi355x-vllm-agentic-lmcache:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: amd/MiniMax-M2.5-MXFP4
+  model-prefix: minimaxm2.5
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 1, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
+      - { tp: 1, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
+
 minimaxm2.5-fp4-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: amd/MiniMax-M2.5-MXFP4
@@ -2494,6 +2526,23 @@ glm5.1-fp4-mi355x-sglang-agentic:
       # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
       - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
 
+# target
+glm5.1-fp4-mi355x-sglang-agentic-hicache:
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
+  model: amd/GLM-5.1-MXFP4
+  model-prefix: glm5.1
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # sglang manages KV eviction. NOTE(review): the "tp=4 conc=16 in fixed-seq" cap was written for the tp=4 entry above; this sweep is tp=2 up to conc=48 - TODO confirm
+      - { tp: 2, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48] }
+      - { tp: 2, ep: 1, offloading: hicache, conc-list: [4, 8, 16, 32, 40, 48] }
+
 kimik2.5-fp4-mi355x-vllm-agentic:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: amd/Kimi-K2.5-MXFP4
@@ -2518,8 +2567,40 @@ kimik2.5-fp4-mi355x-vllm-agentic:
       - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
       - { tp: 4, offloading: cpu,  conc-list: [16, 24, 32, 40] }
 
+# target
+kimik2.5-fp4-mi355x-vllm-agentic-lmcache:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 4, ep: 1, offloading: none, conc-list:    [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
+      - { tp: 4, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
+
+# target
+kimik2.5-fp4-mi355x-vllm-agentic-lmcache-060226DRAM1500GB:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      #- { tp: 4, ep: 1, offloading: none, conc-list:    [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
+      - { tp: 4, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
+
 minimaxm2.5-fp8-mi355x-vllm-agentic:
-  image: vllm/vllm-openai-rocm:v0.22.1
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi355x
@@ -2536,8 +2617,27 @@ minimaxm2.5-fp8-mi355x-vllm-agentic:
       - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
       - { tp: 4, ep: 4, offloading: cpu,  conc-list: [48, 56, 64, 72, 96] }
 
+# target
+minimaxm2.5-fp8-mi355x-vllm-agentic-lmcache:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi355x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # NOTE(review): carried over from the tp=4 ep=4 cpu-offload entry: compute ceiling
+    # ~60 (empirical), KV cliff ~91 (analytical), native OffloadingConnector on AMD.
+    # This entry sweeps tp=2 ep=1 with LMCache offloading - TODO re-derive for this config.
+    - duration: 1800
+      search-space:
+      - { tp: 2, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
+      - { tp: 2, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
+
 minimaxm2.5-fp8-mi300x-vllm-agentic:
-  image: vllm/vllm-openai-rocm:v0.22.1
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi300x
@@ -2555,7 +2655,7 @@ minimaxm2.5-fp8-mi300x-vllm-agentic:
       - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
 
 minimaxm2.5-fp8-mi325x-vllm-agentic:
-  image: vllm/vllm-openai-rocm:v0.22.1
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi325x
@@ -2573,8 +2673,9 @@ minimaxm2.5-fp8-mi325x-vllm-agentic:
       - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
       - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
 
+# target
 qwen3.5-fp8-mi355x-sglang-agentic-hicache:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260531
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: mi355x
@@ -2585,8 +2686,8 @@ qwen3.5-fp8-mi355x-sglang-agentic-hicache:
     agentic-coding:
     - duration: 1800
       search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-      - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
+      - { tp: 4, ep: 1, offloading: none, conc-list: [8, 16, 32, 40, 48, 56, 72] }
+      - { tp: 4, ep: 1, offloading: hicache, conc-list: [8, 16, 32, 40, 48, 56, 72] }
 
 dsv4-fp4-mi355x-vllm-agentic:
   image: vllm/vllm-openai-rocm:v0.22.0
diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
index 3b85a31cd..6bea8dddd 100755
--- a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
@@ -2,18 +2,29 @@
 set -euo pipefail
 set -x
 
-# Agentic trace replay benchmark for GLM-5.1 FP4 on MI355X using SGLang.
+# Agentic trace replay benchmark for GLM-5.1 FP4 on MI355X using SGLang.
+#
+# Server recipe (see the launch command below): tilelang NSA prefill/decode
+# backends, fp8_e4m3 KV cache, mem-fraction-static 0.85, glm47 tool-call
+# parser and glm45 reasoning parser.
+# The agentic harness (resolve_trace_source / build_replay_cmd /
+# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and the
+# radix prefix cache stays enabled because agentic replay needs prefix reuse.
 #
 # Required env vars:
-#   MODEL, TP, CONC, RESULT_DIR
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION (optional: PORT=8888, DURATION=1800)
+#
+# OFFLOADING values:
+#   none    - SGLang GPU KV with the default RadixAttention prefix cache.
+#   hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC RESULT_DIR DURATION
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
 
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=131072
-fi
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -30,8 +41,16 @@ else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
 fi
+
 rocm-smi || true
 amd-smi || true
+# ---- Select trace corpus ----------------------------------------------------
+# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
+# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf
+# signal at high concurrency.
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+# 060226 variant of the 256k-capped corpus:
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
 
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source
@@ -48,26 +67,85 @@ mkdir -p "$RESULT_DIR"
 
 pip install -U transformers
 
+CACHE_ARGS=()
+WARMUP_ARGS=()
+CUDA_GRAPH_MAX_BS="$CONC"
+case "$OFFLOADING" in
+    none)
+        # Leave SGLang's default RadixAttention prefix cache on — agentic
+        # replay needs it; --disable-radix-cache would zero the hit rate.
+        ;;
+    hicache)
+        # GLM-5.1 FP4 uses a standard transformer (no hybrid Mamba path),
+        # so one HiCache host pool per TP rank is sufficient.
+        # Node-total DRAM budget (caller's TOTAL_CPU_DRAM_GB, else 3000) is divided by TP and host-pool count.
+        TOTAL_CPU_DRAM_GB="${TOTAL_CPU_DRAM_GB:-3000}"
+        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}"
+        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-500}}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
+        # GLM-5.1 uses standard paged attention (no no_buffer scheduler constraint),
+        # so page_size can be left at the default. Keep the safer direct/layer_first
+        # copy path on ROCm.
+        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
+        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
+        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
+        if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
+            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
+        fi
+        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
+            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
+            exit 1
+        fi
+        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
+        CACHE_ARGS=(
+            --page-size "$HICACHE_PAGE_SIZE"
+            --enable-hierarchical-cache
+            --hicache-size "$HICACHE_SIZE_GB"
+            --hicache-io-backend "$HICACHE_IO_BACKEND"
+            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+        )
+        # HiCache startup reaches API readiness but SGLang's internal warmup
+        # request can time out on this path; let aiperf own benchmark traffic.
+        WARMUP_ARGS=(--skip-server-warmup)
+        # Don't force ROCm graph capture at every high concurrency point; conc=16
+        # is the highest known-good capture size for this model/server path.
+        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}"
+        if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
+            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
+        fi
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
 echo "Starting SGLang server..."
 export PYTHONNOUSERSITE=1
 
+pip install -U transformers
 python3 -m sglang.launch_server \
-    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
+    --model-path "$MODEL_PATH" \
+    --served-model-name "$MODEL" \
     --host=0.0.0.0 \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
     --cuda-graph-max-bs $CONC \
     --max-running-requests $CONC \
-    --context-length $MAX_MODEL_LEN \
     --mem-fraction-static 0.85 \
     --tool-call-parser glm47 \
     --reasoning-parser glm45 \
     --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \
     --nsa-prefill-backend tilelang \
     --nsa-decode-backend tilelang \
+    --watchdog-timeout 1200 \
     --kv-cache-dtype fp8_e4m3 \
     --tokenizer-worker-num $((TP*2)) \
+    "${CACHE_ARGS[@]}" \
+    "${WARMUP_ARGS[@]}" \
     --enable-metrics > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
@@ -77,4 +155,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
index 139b12256..b3211ff49 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
@@ -14,15 +14,11 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
-
-# Kimi-K2.5 advertises a 262144-token context window in vLLM 0.21.0.
-# Matrix defaults may export MAX_MODEL_LEN=0 to mean "server default"; for this
-# script we need the concrete value so AgentX filters prompt+max_tokens against
-# the same limit vLLM enforces.
-if [[ -z "${MAX_MODEL_LEN:-}" || "$MAX_MODEL_LEN" == "0" ]]; then
-    MAX_MODEL_LEN=262144
-fi
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
+
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -44,546 +40,22 @@ else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
 fi
+
 rocm-smi || true
 amd-smi || true
 
+# ---- Select trace corpus ----------------------------------------------------
+# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
+# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf
+# signal at high concurrency.
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+# 060226 variant of the 256k-capped corpus:
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
+
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source
 install_agentic_deps
 
-# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug)
-pip install amd-quark
-
-# Disable AITER RMSNorm for TP < 8 due to accuracy issues
-if [ "${TP}" -lt 8 ]; then
-  export VLLM_ROCM_USE_AITER_RMSNORM=0
-fi
-
-write_lmcache_rocm_mp_patch() {
-    local patch_dir="$1"
-    mkdir -p "$patch_dir"
-    cat > "$patch_dir/sitecustomize.py" <<'PY'
-"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches."""
-
-import os
-import threading
-
-if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1":
-    import builtins
-    import sys
-
-    _orig_import = builtins.__import__
-
-    def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None:
-        _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator
-
-        if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False):
-            return
-
-        _orig_init = _LazyMemoryAllocator.__init__
-        _orig_allocate = _LazyMemoryAllocator.allocate
-        _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate
-
-        def _expand_to(self, target_size: int) -> None:
-            target_size = min(
-                self._final_size,
-                _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE),
-            )
-            lock = self._agentic_rocm_demand_expand_lock
-            with lock:
-                if target_size <= self._curr_size:
-                    return
-
-                start_size = self._curr_size
-                while self._curr_size < target_size:
-                    commit_start = self._curr_size
-                    commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE)
-                    while self._curr_size < commit_target:
-                        self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE)
-                        self._curr_size += self.PIN_CHUNK_SIZE
-                    self._commit_expansion(self._curr_size - commit_start)
-
-                self._log_expansion_progress(self._curr_size - start_size)
-
-        def _retry_with_demand_expansion(self, allocate_once):
-            obj = allocate_once()
-            step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64"))
-            step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3)))
-
-            while obj is None and self._curr_size < self._final_size:
-                _expand_to(self, self._curr_size + step_bytes)
-                obj = allocate_once()
-
-            return obj
-
-        def _patched_init(self, *args, **kwargs):
-            _orig_init(self, *args, **kwargs)
-            self._agentic_rocm_demand_expand_lock = threading.Lock()
-
-            # LMCache MP's upstream LazyMemoryAllocator currently expands to
-            # the final pinned size in a background thread. On ROCm Kimi TP4,
-            # vLLM reaches KV-cache registration only after that 2.5 TB pool
-            # is fully pinned, and the server-side IPC open path can stall
-            # before acknowledging register_kv_caches. Keep the same final
-            # capacity, but pin/commit extra host memory only when L1
-            # allocations actually need it.
-            self._stop_expand.set()
-            self._expand_thread.join()
-            _lazy_memory_allocator.logger.info(
-                "Agentic ROCm patch: using demand-driven LMCache pinned "
-                "memory expansion; final capacity remains %s MB",
-                self._final_size >> 20,
-            )
-
-        def _patched_allocate(
-            self,
-            shapes,
-            dtypes,
-            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
-            allocator_type=None,
-        ):
-            return _retry_with_demand_expansion(
-                self,
-                lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type),
-            )
-
-        def _patched_batched_allocate(
-            self,
-            shapes,
-            dtypes,
-            batch_size,
-            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
-            allocator_type=None,
-        ):
-            return _retry_with_demand_expansion(
-                self,
-                lambda: _orig_batched_allocate(
-                    self, shapes, dtypes, batch_size, fmt, allocator_type
-                ),
-            )
-
-        _LazyMemoryAllocator.__init__ = _patched_init
-        _LazyMemoryAllocator.allocate = _patched_allocate
-        _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate
-        _LazyMemoryAllocator._agentic_rocm_demand_patch = True
-
-    def _patch_l1_memory_manager(_memory_manager) -> None:
-        _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None)
-        _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None)
-        if _L1MemoryManager is None or _LazyMemoryAllocator is None:
-            return
-        if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False):
-            return
-
-        _orig_get_memory_usage = _L1MemoryManager.get_memory_usage
-
-        def _patched_get_memory_usage(self):
-            allocator = getattr(self, "_allocator", None)
-            if isinstance(allocator, _LazyMemoryAllocator):
-                address_manager = allocator.get_address_manager()
-                used_size = (
-                    address_manager.get_heap_size() - address_manager.get_free_size()
-                )
-                return used_size, allocator._final_size
-            return _orig_get_memory_usage(self)
-
-        _L1MemoryManager.get_memory_usage = _patched_get_memory_usage
-        _L1MemoryManager._agentic_rocm_final_capacity_patch = True
-
-    def _maybe_patch_lazy_memory_allocator() -> None:
-        module = sys.modules.get("lmcache.v1.lazy_memory_allocator")
-        if module is not None and hasattr(module, "LazyMemoryAllocator"):
-            _patch_lazy_memory_allocator(module)
-
-    def _maybe_patch_l1_memory_manager() -> None:
-        module = sys.modules.get("lmcache.v1.distributed.memory_manager")
-        if module is not None and hasattr(module, "L1MemoryManager"):
-            _patch_l1_memory_manager(module)
-
-    def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0):
-        module = _orig_import(name, globals, locals, fromlist, level)
-        if name == "lmcache.v1.lazy_memory_allocator" or (
-            name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules
-        ):
-            _maybe_patch_lazy_memory_allocator()
-        if name == "lmcache.v1.distributed.memory_manager" or (
-            name.startswith("lmcache")
-            and "lmcache.v1.distributed.memory_manager" in sys.modules
-        ):
-            _maybe_patch_l1_memory_manager()
-        return module
-
-    builtins.__import__ = _agentic_rocm_import
-    _maybe_patch_lazy_memory_allocator()
-    _maybe_patch_l1_memory_manager()
-
-if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1":
-    import torch
-    import lmcache.non_cuda_equivalents as lmc
-
-    if not hasattr(lmc, "multi_layer_block_kv_transfer"):
-        _DTYPE_BY_NAME = {
-            "bfloat16": torch.bfloat16,
-            "float16": torch.float16,
-            "float32": torch.float32,
-        }
-
-        def _dtype_from_env() -> torch.dtype:
-            name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16")
-            try:
-                return _DTYPE_BY_NAME[name]
-            except KeyError as exc:
-                raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc
-
-        def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
-            block_stride = shape_desc.block_stride_elems or (
-                shape_desc.bs * shape_desc.nh * shape_desc.hs
-            )
-            base = lmc._tensor_from_ptr(
-                ptr,
-                (shape_desc.nb * block_stride,),
-                dtype,
-                device,
-            )
-            return torch.as_strided(
-                base,
-                (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs),
-                (block_stride, shape_desc.nh * shape_desc.hs, 1),
-            )
-
-        def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
-            return lmc._tensor_from_ptr(
-                ptr,
-                (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs),
-                dtype,
-                device,
-            )
-
-        def multi_layer_block_kv_transfer(
-            group_kv_pointers,
-            tmp_buffer_ptrs,
-            block_ids,
-            paged_memory_device,
-            direction,
-            shape_desc,
-            lmcache_chunk_size,
-            gpu_kv_format,
-            skip_blocks=0,
-        ) -> None:
-            # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with
-            # shape [num_blocks, block_size, hidden_size]. LMCache's Python
-            # fallback has no block-transfer entrypoint yet, so implement the
-            # same gather/scatter contract with torch indexing on ROCm.
-            if shape_desc.kv_size != 1:
-                raise NotImplementedError(
-                    "ROCm LMCache MP block fallback currently supports MLA KV caches only"
-                )
-
-            dtype = _dtype_from_env()
-            device = (
-                paged_memory_device
-                if isinstance(paged_memory_device, torch.device)
-                else torch.device(paged_memory_device)
-            )
-            num_layers = int(group_kv_pointers.numel())
-            blocks_per_chunk = lmcache_chunk_size // shape_desc.bs
-            direction_name = getattr(direction, "name", str(direction))
-
-            for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs):
-                start = chunk_idx * blocks_per_chunk
-                end = start + blocks_per_chunk
-                chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long)
-
-                dest_slot_offset = 0
-                if skip_blocks and chunk_idx == 0:
-                    chunk_blocks = chunk_blocks[int(skip_blocks):]
-                    dest_slot_offset = int(skip_blocks) * shape_desc.bs
-                if chunk_blocks.numel() == 0:
-                    continue
-
-                num_slots = int(chunk_blocks.numel()) * shape_desc.bs
-                tmp = _tmp_view(
-                    int(tmp_ptr),
-                    shape_desc,
-                    num_layers,
-                    lmcache_chunk_size,
-                    dtype,
-                    device,
-                )
-
-                for layer_idx in range(num_layers):
-                    paged = _paged_view(
-                        int(group_kv_pointers[layer_idx].item()),
-                        shape_desc,
-                        dtype,
-                        device,
-                    )
-                    tmp_slice = tmp[
-                        0,
-                        layer_idx,
-                        dest_slot_offset : dest_slot_offset + num_slots,
-                        :,
-                    ]
-                    if direction_name == "D2H":
-                        gathered = paged.index_select(0, chunk_blocks).reshape(
-                            num_slots, shape_desc.nh * shape_desc.hs
-                        )
-                        tmp_slice.copy_(gathered)
-                    elif direction_name == "H2D":
-                        src = tmp_slice.reshape(
-                            int(chunk_blocks.numel()),
-                            shape_desc.bs,
-                            shape_desc.nh * shape_desc.hs,
-                        )
-                        paged.index_copy_(0, chunk_blocks, src)
-                    else:
-                        raise ValueError(f"Unsupported transfer direction: {direction}")
-
-        lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer
-
-# ---- Chunked KV loading (prevents GPU block exhaustion at high concurrency) ----
-if os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "0") != "0":
-    import chunked_connector_patch  # noqa: F401
-
-# ---- vLLM scheduler assertion fix (stale KV transfer notifications) ----
-import scheduler_assertion_patch  # noqa: F401
-PY
-}
-
-write_chunked_connector_patch() {
-    local patch_dir="$1"
-    mkdir -p "$patch_dir"
-    cat > "$patch_dir/chunked_connector_patch.py" <<'PY'
-"""
-Monkey-patch for LMCacheMPConnector to add chunked KV loading.
-
-Fixes GPU block exhaustion deadlock at high concurrency by capping
-the number of external tokens reported AND retrieved per scheduling step.
-
-Usage: set CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD=<tokens> and import this
-module from sitecustomize.py before LMCache is loaded.
-"""
-
-import logging
-import os
-import sys
-import builtins
-
-logger = logging.getLogger("chunked_lmcache_patch")
-
-_MAX_TOKENS = int(os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "32768"))
-
-# Per-request chunk tracking (module-level, survives across calls)
-_chunk_state: dict[str, dict] = {}
-
-
-def _apply_patch():
-    """Patch LMCacheMPConnector in-place."""
-    mod = sys.modules.get("lmcache.integration.vllm.lmcache_mp_connector")
-    if mod is None:
-        return
-    cls = getattr(mod, "LMCacheMPConnector", None)
-    if cls is None or getattr(cls, "_chunked_patch_applied", False):
-        return
-
-    LMCacheMPRequestState = getattr(mod, "LMCacheMPRequestState", None)
-    _orig_get_matched = cls.get_num_new_matched_tokens
-    _orig_get_finished = cls.get_finished
-
-    def _get_blocks_per_chunk(self):
-        block_size = getattr(self, "block_size", 1)
-        return max(1, _MAX_TOKENS // block_size)
-
-    def _patched_get_num_new_matched_tokens(self, request, num_computed_tokens):
-        full_match = _orig_get_matched(self, request, num_computed_tokens)
-        if full_match <= 0 or _MAX_TOKENS <= 0:
-            return full_match
-
-        req_id = request.request_id
-        block_size = getattr(self, "block_size", 1)
-        blocks_per_chunk = _get_blocks_per_chunk(self)
-        full_match_blocks = full_match // block_size
-
-        state = _chunk_state.get(req_id)
-        if state is None or state.get("num_computed_at_start") != num_computed_tokens:
-            state = {
-                "full_match_blocks": full_match_blocks,
-                "chunk_end_blocks": 0,
-                "num_computed_at_start": num_computed_tokens,
-                "lookup_done": False,
-            }
-            _chunk_state[req_id] = state
-
-        if state["lookup_done"]:
-            return 0
-
-        remaining = state["full_match_blocks"] - state["chunk_end_blocks"]
-        if remaining <= 0:
-            state["lookup_done"] = True
-            return 0
-
-        this_chunk = min(remaining, blocks_per_chunk)
-        state["chunk_end_blocks"] += this_chunk
-        if state["chunk_end_blocks"] >= state["full_match_blocks"]:
-            state["lookup_done"] = True
-
-        capped = this_chunk * block_size
-        if capped < full_match:
-            logger.debug(
-                "Chunked LMCache: req %s capped %d -> %d tokens "
-                "(chunk %d/%d blocks)",
-                req_id, full_match, capped, this_chunk, full_match_blocks,
-            )
-
-        # Cap the tracker's hit blocks to match what we report
-        tracker = getattr(request, "kv_transfer_params", None)
-        if tracker is not None:
-            orig_hits = getattr(tracker, "num_lmcache_hit_blocks", 0)
-            if orig_hits > this_chunk:
-                tracker.num_lmcache_hit_blocks = this_chunk
-
-        return capped
-
-    def _patched_get_finished(self, scheduler_output):
-        result = _orig_get_finished(self, scheduler_output)
-        # Clean up chunk state for finished requests.
-        # vLLM passes scheduler_output as a set of request-ID strings
-        # (not a SchedulerOutput object), so iterate directly when it
-        # is a set/frozenset; fall back to the attribute path for
-        # forward compatibility.
-        if isinstance(scheduler_output, (set, frozenset)):
-            finished = scheduler_output
-        else:
-            finished = getattr(scheduler_output, "finished_req_ids", [])
-        for req in finished:
-            _chunk_state.pop(req, None)
-        return result
-
-    cls.get_num_new_matched_tokens = _patched_get_num_new_matched_tokens
-    cls.get_finished = _patched_get_finished
-    cls._chunked_patch_applied = True
-    logger.info(
-        "Chunked LMCache connector patch applied "
-        "(max_tokens_per_load=%d)", _MAX_TOKENS,
-    )
-
-
-_orig_import = builtins.__import__
-
-
-def _patching_import(name, *args, **kwargs):
-    module = _orig_import(name, *args, **kwargs)
-    if (
-        name == "lmcache.integration.vllm.lmcache_mp_connector"
-        or (
-            name.startswith("lmcache")
-            and "lmcache.integration.vllm.lmcache_mp_connector" in sys.modules
-        )
-    ):
-        _apply_patch()
-    return module
-
-
-builtins.__import__ = _patching_import
-_apply_patch()
-PY
-}
-
-write_scheduler_assertion_patch() {
-    local patch_dir="$1"
-    mkdir -p "$patch_dir"
-    cat > "$patch_dir/scheduler_assertion_patch.py" <<'PY'
-"""
-Patch vLLM scheduler to handle stale finished_recving gracefully.
-
-The assertion at scheduler.py crashes when a KV transfer reports
-"finished recving" but the request is already in RUNNING state.
-This happens when transfers complete asynchronously and the scheduler
-has already moved the request forward.
-
-Fix: Instead of asserting, log a warning and skip.
-"""
-
-import logging
-import sys
-import builtins
-
-logger = logging.getLogger("scheduler_assertion_patch")
-
-
-def _apply_patch():
-    """Patch vLLM scheduler's _update_from_kv_xfer_finished."""
-    sched_mod = sys.modules.get("vllm.v1.core.sched.scheduler")
-    if sched_mod is None:
-        return
-    req_mod = sys.modules.get("vllm.v1.request")
-    if req_mod is None:
-        return
-    Scheduler = getattr(sched_mod, "Scheduler", None)
-    RequestStatus = getattr(req_mod, "RequestStatus", None)
-    if Scheduler is None or RequestStatus is None:
-        return
-    if getattr(Scheduler, "_kv_xfer_patch_applied", False):
-        return
-
-    _orig_update = Scheduler._update_from_kv_xfer_finished
-
-    def _patched_update(self, kv_connector_output):
-        if self.connector is not None:
-            self.connector.update_connector_output(kv_connector_output)
-        for req_id in kv_connector_output.finished_recving or ():
-            if req_id not in self.requests:
-                continue
-            req = self.requests[req_id]
-            if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
-                self.finished_recving_kv_req_ids.add(req_id)
-            elif RequestStatus.is_finished(req.status):
-                self._free_blocks(self.requests[req_id])
-            else:
-                logger.warning(
-                    "Stale finished_recving for req %s in status %s; skipping.",
-                    req_id, req.status.name,
-                )
-        for req_id in kv_connector_output.finished_sending or ():
-            if req_id not in self.requests:
-                continue
-            self._free_blocks(self.requests[req_id])
-
-    Scheduler._update_from_kv_xfer_finished = _patched_update
-    Scheduler._kv_xfer_patch_applied = True
-    logger.info("Scheduler KV transfer assertion patch applied")
-
-
-_orig_import = builtins.__import__
-
-
-def _patching_import(name, *args, **kwargs):
-    module = _orig_import(name, *args, **kwargs)
-    if (
-        name == "vllm.v1.core.sched.scheduler"
-        or (
-            name.startswith("vllm")
-            and "vllm.v1.core.sched.scheduler" in sys.modules
-        )
-    ):
-        _apply_patch()
-    return module
-
-
-builtins.__import__ = _patching_import
-_apply_patch()
-PY
-}
-
-# Workaround for MEC FW <177 RCCL memory reclaim issue
-version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
-if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
-    export HSA_NO_SCRATCH_RECLAIM=1
-fi
-
-export VLLM_ROCM_USE_AITER=1
-export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
-
 # ---- Server config ----------------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
 LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
@@ -591,6 +63,8 @@ mkdir -p "$RESULT_DIR"
 
 OFFLOAD_ARGS=()
 PREFIX_CACHE_ARGS=()
+
+# ---- Lmcache config ----------------------------------------------------------
 LMCACHE_PID=""
 
 cleanup_lmcache_server() {
@@ -648,7 +122,9 @@ case "$OFFLOADING" in
         # MI355X nodes have ~2.7 TiB of host DRAM available for offload;
         # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for
         # worker RSS / page cache / slurm cgroup).
-        TOTAL_CPU_DRAM_GB=2500
+        # TODO: fix - 3000 GB exceeds the 2.5 TB reservation described above; confirm the budget.
+        TOTAL_CPU_DRAM_GB=3000
+        TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB * TP / 8))}"
         # Use vLLM's regular native KV-offload path (OffloadingConnector),
         # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to
         # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1
@@ -659,7 +135,7 @@ case "$OFFLOADING" in
         # (vllm/config/vllm.py:662).
         OFFLOAD_ARGS=(
             --kv_offloading_backend native
-            --kv_offloading_size "$TOTAL_CPU_DRAM_GB"
+            --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB"
             --disable-hybrid-kv-cache-manager
         )
         ;;
@@ -667,74 +143,20 @@ case "$OFFLOADING" in
         { set +x; } 2>/dev/null
         unset VLLM_USE_SIMPLE_KV_OFFLOAD
 
-        agentic_pip_install --quiet --no-cache-dir lmcache
-        # LMCache's current dependency chain can install NVIDIA/CUDA NIXL and
-        # CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and
-        # during Kimi fused-MoE model inspection it imports nixl_ep whenever
-        # that module is importable, even when this run is not using EP/NIXL
-        # kernels. The CUDA extension then fails immediately on AMD nodes with
-        # "ImportError: libcuda.so.1".
-        #
-        # LMCache MP also uses CuPy stream APIs while registering vLLM's KV
-        # caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime
-        # with cudaErrorInsufficientDriver when LMCache touches the stream. Use
-        # the ROCm 7 CuPy wheel so the same API dispatches through HIP.
-        python3 -m pip uninstall -y \
-            nixl nixl-cu12 nixl-cu13 nixl_ep \
-            >/dev/null 2>&1 || true
-        python3 -m pip uninstall -y \
-            cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \
-            >/dev/null 2>&1 || true
-        agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0
-        python3 - <<'PY'
-import importlib.util
-import sys
-
-spec = importlib.util.find_spec("nixl_ep")
-if spec is not None:
-    locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"])
-    print(
-        "Error: nixl_ep is still importable after LMCache install; "
-        "this ROCm Kimi run would import a CUDA-only nixl_ep module. "
-        f"location={locations}",
-        file=sys.stderr,
-    )
-    sys.exit(1)
-
-try:
-    from cupy_backends.cuda.api import runtime as cupy_runtime
-except Exception as exc:
-    print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr)
-    sys.exit(1)
-
-if not getattr(cupy_runtime, "is_hip", False):
-    print(
-        "Error: CuPy is still using the CUDA backend after installing "
-        "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.",
-        file=sys.stderr,
-    )
-    sys.exit(1)
-PY
-        LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch"
-        write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR"
-        write_chunked_connector_patch "$LMCACHE_ROCM_PATCH_DIR"
-        write_scheduler_assertion_patch "$LMCACHE_ROCM_PATCH_DIR"
-        export LMCACHE_ROCM_MP_BLOCK_FALLBACK=1
-        export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16
-        export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=1
-        # Cap external KV tokens loaded per scheduling step to prevent GPU
-        # block exhaustion deadlock at high concurrency (c>=32).  Default
-        # 32768 keeps peak block demand within the GPU KV pool.  Set to 0 to
-        # disable chunking (only safe at low concurrency).
-        export CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD="${CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD:-32768}"
-        export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}"
+        [[ -d LMCache/.git ]] || git clone --depth 1 https://github.com/LMCache/LMCache.git
+        cd LMCache
+        pip install -r requirements/build.txt
+        CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation
+        cd ..
+
         python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
 
         # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
         # pool, but let the external MP server own that pool so vLLM does not
         # split --kv-offloading-size across TP ranks through the integrated
         # LMCache backend.
-        TOTAL_CPU_DRAM_GB=2500
+        # TODO: fix - 3000 GB no longer matches the 2.5 TB pool described in the comment above.
+        TOTAL_CPU_DRAM_GB=3000
         LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
         LMCACHE_PORT="${LMCACHE_PORT:-5555}"
         LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
@@ -742,7 +164,7 @@ PY
         # ZMQ endpoint. Bind the server to a raw host, but pass the connector a
         # ZMQ-style host string.
         LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}"
-        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}"
+        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB * TP / 8))}"
         LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
         # LMCache read locks are leases on chunks that lookup has promised
         # vLLM can retrieve. The default 300s TTL is too short for this
@@ -750,10 +172,11 @@ PY
         # lookup and retrieve while GPU KV is saturated, which leaves the
         # object present in L1 but no longer readable. Keep the 2.5 TB pool
         # size unchanged and only extend the lookup-to-retrieve lease.
-        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-3600}"
+        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}"
         LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
         LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
         export PYTHONHASHSEED="${PYTHONHASHSEED:-0}"
+        export LMCACHE_BLOCKING_TIMEOUT_SECS=120
 
         echo "Starting LMCache MP server..."
         LMCACHE_CMD=(
@@ -786,6 +209,7 @@ PY
     *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;;
 esac
 
+# ---- LLM server config ----------------------------------------------------------
 EP_ARGS=()
 if [ "$EP_SIZE" -gt 1 ]; then
     EP_ARGS=(--enable-expert-parallel)
@@ -794,6 +218,23 @@ fi
 echo "Starting vllm server..."
 export PYTHONNOUSERSITE=1
 
+# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug)
+pip install amd-quark
+
+# Disable AITER RMSNorm for TP < 8 due to accuracy issues
+if [ "${TP}" -lt 8 ]; then
+  export VLLM_ROCM_USE_AITER_RMSNORM=0
+fi
+
+# Workaround for MEC FW <177 RCCL memory reclaim issue
+version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
+if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
+    export HSA_NO_SCRATCH_RECLAIM=1
+fi
+
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+
 { set +x; } 2>/dev/null
 VLLM_CMD=(
     vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
@@ -802,9 +243,9 @@ VLLM_CMD=(
     --tensor-parallel-size="$TP"
     "${EP_ARGS[@]}"
     --gpu-memory-utilization 0.90
+    --kv-cache-dtype fp8
     --block-size=1
     --trust-remote-code
-    --max-model-len "$MAX_MODEL_LEN"
     --max-num-seqs "$CONC"
     --mm-encoder-tp-mode data
     "${PREFIX_CACHE_ARGS[@]}"
@@ -821,4 +262,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh
new file mode 100755
index 000000000..640fe7f65
--- /dev/null
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh
@@ -0,0 +1,272 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for MiniMax-M2.5 FP4 on MI355X using vLLM.
+#
+# Required env vars:
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION
+#
+# OFFLOADING values:
+#   none    - vLLM GPU KV only.
+#   cpu     - vLLM native CPU offload.
+#   lmcache - LMCache MP server + vLLM LMCacheMPConnector.
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
+
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+
+# ROCR/HIP visibility for vLLM 0.14+
+if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
+    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
+fi
+
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
+rocm-smi || true
+amd-smi || true
+
+# ---- Resolve traces and install deps ----------------------------------------
+# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
+# corpus has requests up to ~1M proxy tokens that would be rejected.
+# Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+#060226
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
+
+resolve_trace_source
+install_agentic_deps
+
+# ---- Server config ----------------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
+mkdir -p "$RESULT_DIR"
+
+OFFLOAD_ARGS=()
+PREFIX_CACHE_ARGS=()
+
+# ---- Lmcache config ----------------------------------------------------------
+LMCACHE_PID=""
+
+cleanup_lmcache_server() {
+    if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then
+        kill "$LMCACHE_PID" 2>/dev/null || true
+        wait "$LMCACHE_PID" 2>/dev/null || true
+    fi
+}
+
+trap cleanup_lmcache_server EXIT
+
+wait_for_lmcache_ready() {
+    { set +x; } 2>/dev/null
+    local attempts="${LMCACHE_READY_ATTEMPTS:-120}"
+    local tail_pid=""
+
+    while [ ! -f "$LMCACHE_LOG" ]; do
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before creating log file. Exiting." >&2
+            exit 1
+        fi
+        sleep 1
+    done
+
+    tail -f -n +1 "$LMCACHE_LOG" &
+    tail_pid=$!
+
+    for ((i = 1; i <= attempts; i++)); do
+        if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            return 0
+        fi
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before becoming healthy. Log follows:" >&2
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            cat "$LMCACHE_LOG" >&2 || true
+            exit 1
+        fi
+        sleep 1
+    done
+
+    echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2
+    kill "$tail_pid" 2>/dev/null || true
+    wait "$tail_pid" 2>/dev/null || true
+    cat "$LMCACHE_LOG" >&2 || true
+    exit 1
+}
+
+case "$OFFLOADING" in
+    none) ;;
+    cpu)
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+        # MI355X nodes have ~2.7 TiB of host DRAM available for offload.
+        # Budget 3000 GB in total and give this instance a TP/8 share of it
+        # (the 8 is presumably GPUs per node); TOTAL_CPU_DRAM_PARTITION_GB overrides.
+        TOTAL_CPU_DRAM_GB=3000
+        TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB * TP / 8))}"
+        # Use vLLM's regular native KV-offload path (OffloadingConnector),
+        # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to
+        # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1
+        # would switch it to SimpleCPUOffloadConnector. We intentionally leave
+        # that env var UNSET here so the regular OffloadingConnector path is
+        # used. The shortcut --kv_offloading_backend native + --kv_offloading_size
+        # form constructs the KVTransferConfig at engine startup
+        # (vllm/config/vllm.py:662).
+
+        # The hybrid KV cache manager (vLLM default) would give extra cache
+        # hits, but it is kept disabled here:
+        # (srok) --no-disable-hybrid-kv-cache-manager is not compatible with
+        # lmcache, even for non-HMA; TODO confirm this also applies to the native path. See
+        # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60
+        OFFLOAD_ARGS=(
+            --kv_offloading_backend native
+            --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB"
+            --disable-hybrid-kv-cache-manager
+        )
+        ;;
+    lmcache)
+        { set +x; } 2>/dev/null
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+
+        [[ -d LMCache/.git ]] || git clone --depth 1 https://github.com/LMCache/LMCache.git
+        cd LMCache
+        pip install -r requirements/build.txt
+        CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation
+        cd ..
+
+        python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
+
+        # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
+        # pool, but let the external MP server own that pool so vLLM does not
+        # split --kv-offloading-size across TP ranks through the integrated
+        # LMCache backend.
+        TOTAL_CPU_DRAM_GB=3000
+        LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
+        LMCACHE_PORT="${LMCACHE_PORT:-5555}"
+        LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
+        # LMCacheMPConnector concatenates lmcache.mp.host and port into the
+        # ZMQ endpoint. Bind the server to a raw host, but pass the connector a
+        # ZMQ-style host string.
+        LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}"
+        #LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
+        # (srok) TODO: intentionally use a flat 2000 GB L1 pool (not TP-scaled); overrides the 3000 set above
+        TOTAL_CPU_DRAM_GB=2000
+        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB))}"
+        LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
+        # LMCache read locks are leases on chunks that lookup has promised
+        # vLLM can retrieve. The default 300s TTL is too short for this
+        # long-context agentic queue: TP8/conc32 can spend >300s between
+        # lookup and retrieve while GPU KV is saturated, which leaves the
+        # object present in L1 but no longer readable. Keep the 2.5 TB pool
+        # size unchanged and only extend the lookup-to-retrieve lease.
+        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}"
+        # (srok) check 256 vs 32
+        #LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
+        LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-32}"
+        LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
+        export PYTHONHASHSEED="${PYTHONHASHSEED:-0}"
+        export LMCACHE_BLOCKING_TIMEOUT_SECS=120
+
+        set -x
+        echo "Starting LMCache MP server..."
+        LMCACHE_CMD=(
+            lmcache server
+            --host "$LMCACHE_HOST"
+            --port "$LMCACHE_PORT"
+            --http-host "$LMCACHE_HOST"
+            --http-port "$LMCACHE_HTTP_PORT"
+            --l1-size-gb "$LMCACHE_L1_SIZE_GB"
+            --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB"
+            --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS"
+            --chunk-size "$LMCACHE_CHUNK_SIZE"
+            --max-workers "$LMCACHE_MAX_WORKERS"
+            --eviction-policy LRU
+        )
+        printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt"
+        printf '\n' >> "$RESULT_DIR/lmcache_command.txt"
+        "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 &
+        LMCACHE_PID=$!
+        echo "LMCache server PID: $LMCACHE_PID"
+        wait_for_lmcache_ready
+
+        PREFIX_CACHE_ARGS=(--enable-prefix-caching)
+        # srok,
+        # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma
+        # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60
+        OFFLOAD_ARGS=(
+            --kv-transfer-config
+            "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}"
+            --disable-hybrid-kv-cache-manager
+        )
+        ;;
+    *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;;
+esac
+
+# ---- LLM server config ----------------------------------------------------------
+EP_ARGS=()
+if [ "$EP_SIZE" -gt 1 ]; then
+    EP_ARGS=(--enable-expert-parallel)
+fi
+
+echo "Starting vllm server..."
+export PYTHONNOUSERSITE=1
+
+# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug)
+pip install -q amd-quark
+
+# Workaround for MEC FW <177 RCCL memory reclaim issue
+version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
+if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
+    export HSA_NO_SCRATCH_RECLAIM=1
+fi
+
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+
+{ set +x; } 2>/dev/null
+VLLM_CMD=(
+    vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
+    --host 0.0.0.0
+    --port "$PORT"
+    --tensor-parallel-size="$TP"
+    "${EP_ARGS[@]}"
+    --gpu-memory-utilization 0.95
+    --kv-cache-dtype fp8
+    --block-size=32
+    --trust-remote-code
+    --attention-backend "ROCM_AITER_FA"
+    --max-num-seqs "$CONC"
+    "${PREFIX_CACHE_ARGS[@]}"
+    "${OFFLOAD_ARGS[@]}"
+)
+printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
+printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
+"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
index 8e15e7850..9f1f79a3f 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
@@ -2,18 +2,23 @@
 set -euo pipefail
 set -x
 
-# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI355X using vLLM.
+# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI355X using vLLM.
 #
 # Required env vars:
-#   MODEL, TP, CONC, RESULT_DIR
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION
+#
+# OFFLOADING values:
+#   none    - vLLM GPU KV only.
+#   cpu     - vLLM native CPU offload.
+#   lmcache - LMCache MP server + vLLM LMCacheMPConnector.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
 
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=131072
-fi
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -24,6 +29,10 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
+# Model download is handled once by the MODEL_PATH block below (avoids a second full download).
+rocm-smi || true
+amd-smi || true
+
 # `hf download` creates the target dir if missing and is itself idempotent.
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
@@ -35,59 +44,240 @@ else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
 fi
-rocm-smi || true
-amd-smi || true
 
 # ---- Resolve traces and install deps ----------------------------------------
 # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
 # corpus has requests up to ~1M proxy tokens that would be rejected.
 # Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+#060226
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
 
 resolve_trace_source
 install_agentic_deps
 
 # ---- Server config ----------------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
+LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
 mkdir -p "$RESULT_DIR"
 
-OFFLOAD_ARGS=""
+OFFLOAD_ARGS=()
+PREFIX_CACHE_ARGS=()
+
+# ---- Lmcache config ----------------------------------------------------------
+LMCACHE_PID=""
+
+cleanup_lmcache_server() {
+    if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then
+        kill "$LMCACHE_PID" 2>/dev/null || true
+        wait "$LMCACHE_PID" 2>/dev/null || true
+    fi
+}
+
+trap cleanup_lmcache_server EXIT
+
+wait_for_lmcache_ready() {
+    { set +x; } 2>/dev/null
+    local attempts="${LMCACHE_READY_ATTEMPTS:-120}"
+    local tail_pid=""
+
+    while [ ! -f "$LMCACHE_LOG" ]; do
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before creating log file. Exiting." >&2
+            exit 1
+        fi
+        sleep 1
+    done
+
+    tail -f -n +1 "$LMCACHE_LOG" &
+    tail_pid=$!
+
+    for ((i = 1; i <= attempts; i++)); do
+        if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            return 0
+        fi
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before becoming healthy. Log follows:" >&2
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            cat "$LMCACHE_LOG" >&2 || true
+            exit 1
+        fi
+        sleep 1
+    done
+
+    echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2
+    kill "$tail_pid" 2>/dev/null || true
+    wait "$tail_pid" 2>/dev/null || true
+    cat "$LMCACHE_LOG" >&2 || true
+    exit 1
+}
+
 case "$OFFLOADING" in
     none) ;;
     cpu)
-        # SimpleCPUOffloadConnector now works on ROCm with the
-        # vllm/vllm-openai-rocm:nightly-51f22dcfd0... image (vllm-project/vllm@20cac26b).
-        # Use the same offload path as NVIDIA so cross-vendor cpu-offload
-        # numbers are apples-to-apples.
-        # MI355X nodes have substantial DRAM; override workflow default (600 GB)
-        # so we offload up to 2 TB of KV cache.
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+        # MI355X nodes have ~2.7 TiB of host DRAM available for offload.
+        # Budget 3000 GB in total and give this instance a TP/8 share of it
+        # (the 8 is presumably GPUs per node); TOTAL_CPU_DRAM_PARTITION_GB overrides.
+        TOTAL_CPU_DRAM_GB=3000
+        TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB * TP / 8))}"
+        # Use vLLM's regular native KV-offload path (OffloadingConnector),
+        # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to
+        # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1
+        # would switch it to SimpleCPUOffloadConnector. We intentionally leave
+        # that env var UNSET here so the regular OffloadingConnector path is
+        # used. The shortcut --kv_offloading_backend native + --kv_offloading_size
+        # form constructs the KVTransferConfig at engine startup
+        # (vllm/config/vllm.py:662).
+
+        # Remove --disable-hybrid-kv-cache-manager and enable hybrid kv cache manager (default)
+        # This gives extra cache hit than disabling hybrid kv cache manager
+        OFFLOAD_ARGS=(
+            --kv_offloading_backend native
+            --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB"
+        )
+        ;;
+    lmcache)
+        { set +x; } 2>/dev/null
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+
+        [[ -d LMCache/.git ]] || git clone --depth 1 https://github.com/LMCache/LMCache.git
+        cd LMCache
+        pip install -r requirements/build.txt
+        CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation
+        cd ..
+
+        python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
+
+        # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
+        # pool, but let the external MP server own that pool so vLLM does not
+        # split --kv-offloading-size across TP ranks through the integrated
+        # LMCache backend.
+        TOTAL_CPU_DRAM_GB=3000
+        LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
+        LMCACHE_PORT="${LMCACHE_PORT:-5555}"
+        LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
+        # LMCacheMPConnector concatenates lmcache.mp.host and port into the
+        # ZMQ endpoint. Bind the server to a raw host, but pass the connector a
+        # ZMQ-style host string.
+        LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}"
+        #LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
+        # (srok) TODO: intentionally use a flat 2000 GB L1 pool (not TP-scaled); overrides the 3000 set above
         TOTAL_CPU_DRAM_GB=2000
-        export VLLM_USE_SIMPLE_KV_OFFLOAD=1
-        OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager"
+        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB))}"
+        LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
+        # LMCache read locks are leases on chunks that lookup has promised
+        # vLLM can retrieve. The default 300s TTL is too short for this
+        # long-context agentic queue: TP8/conc32 can spend >300s between
+        # lookup and retrieve while GPU KV is saturated, which leaves the
+        # object present in L1 but no longer readable. Keep the 2.5 TB pool
+        # size unchanged and only extend the lookup-to-retrieve lease.
+        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}"
+        # (srok) check 256 vs 32
+        #LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
+        LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-32}"
+        LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
+        export PYTHONHASHSEED="${PYTHONHASHSEED:-0}"
+        export LMCACHE_BLOCKING_TIMEOUT_SECS=120
+
+        set -x
+        echo "Starting LMCache MP server..."
+        LMCACHE_CMD=(
+            lmcache server
+            --host "$LMCACHE_HOST"
+            --port "$LMCACHE_PORT"
+            --http-host "$LMCACHE_HOST"
+            --http-port "$LMCACHE_HTTP_PORT"
+            --l1-size-gb "$LMCACHE_L1_SIZE_GB"
+            --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB"
+            --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS"
+            --chunk-size "$LMCACHE_CHUNK_SIZE"
+            --max-workers "$LMCACHE_MAX_WORKERS"
+            --eviction-policy LRU
+        )
+        printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt"
+        printf '\n' >> "$RESULT_DIR/lmcache_command.txt"
+        "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 &
+        LMCACHE_PID=$!
+        echo "LMCache server PID: $LMCACHE_PID"
+        wait_for_lmcache_ready
+
+        PREFIX_CACHE_ARGS=(--enable-prefix-caching)
+        # Omit --disable-hybrid-kv-cache-manager so the hybrid KV cache manager
+        # (the default) stays enabled; this yields more cache hits than disabling it.
+        OFFLOAD_ARGS=(
+            --kv-transfer-config
+            "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}"
+        )
         ;;
     *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;;
 esac
 
-if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel"; else EP=" "; fi
+# ---- LLM server config ----------------------------------------------------------
+EP_ARGS=()
+if [ "$EP_SIZE" -gt 1 ]; then
+    EP_ARGS=(--enable-expert-parallel)
+fi
 
 echo "Starting vllm server..."
+export PYTHONNOUSERSITE=1
+
+# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug)
+pip install -q amd-quark
+
+# Workaround for MEC FW <177 RCCL memory reclaim issue
+version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
+if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
+    export HSA_NO_SCRATCH_RECLAIM=1
+fi
+
 export VLLM_ROCM_USE_AITER=1
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
-export PYTHONNOUSERSITE=1
+export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0
+VLLM_BLOCK_SIZE=32
+ASYNC_SCHEDULING_ARGS=""
+
+if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
+    export VLLM_ROCM_USE_AITER_MOE=0
+    ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
+    echo "TP8/EP8: using block size 32, shuffle disabled, AITER MoE disabled, async scheduling disabled."
+elif (( CONC < 64 )); then
+    ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
+    echo "c${CONC}: using block size 32, shuffle disabled, async scheduling disabled."
+elif (( CONC == 64 )); then
+    ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
+    export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+    VLLM_BLOCK_SIZE=16
+    echo "c64: using block size 16, shuffle enabled, async scheduling disabled."
+else
+    export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+    VLLM_BLOCK_SIZE=16
+    echo "c${CONC}: using block size 16, shuffle enabled, async scheduling enabled."
+fi
 
-vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
---host 0.0.0.0 \
---port $PORT \
---tensor-parallel-size=$TP \
-$EP \
---gpu-memory-utilization 0.95 \
---max-model-len $MAX_MODEL_LEN \
---kv-cache-dtype fp8 \
---block-size=32 \
---max-num-seqs $CONC \
---attention-backend "ROCM_AITER_UNIFIED_ATTN" \
---trust-remote-code \
-$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 &
+{ set +x; } 2>/dev/null
+VLLM_CMD=(
+    vllm serve "$MODEL"
+    --host 0.0.0.0
+    --port "$PORT"
+    --tensor-parallel-size="$TP"
+    "${EP_ARGS[@]}"
+    --gpu-memory-utilization 0.95
+    --kv-cache-dtype fp8 
+    --block-size=$VLLM_BLOCK_SIZE 
+    --trust-remote-code
+    --attention-backend "ROCM_AITER_FA" 
+    --max-num-seqs "$CONC"
+    $ASYNC_SCHEDULING_ARGS 
+    "${PREFIX_CACHE_ARGS[@]}"
+    "${OFFLOAD_ARGS[@]}"
+)
+printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
+printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
+"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
 
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp4_mi355x.sh
new file mode 100755
index 000000000..fe85b05ab
--- /dev/null
+++ b/benchmarks/single_node/agentic/qwen3.5_fp4_mi355x.sh
@@ -0,0 +1,150 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for Qwen3.5 FP4 on MI355X using SGLang.
+#
+# Base server recipe is adapted from the upstream MI300X reference
+# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe):
+# aiter attention backend, but no allreduce fusion and mem-fraction 0.8.
+# The agentic harness (resolve_trace_source / build_replay_cmd /
+# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and
+# --disable-radix-cache is dropped because agentic replay needs prefix reuse.
+#
+# Required env vars:
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION
+#
+# OFFLOADING values:
+#   none    - SGLang GPU KV with the default RadixAttention prefix cache.
+#   hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix.
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
+
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
+
+SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30}
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE.
+# NOTE(review): the server below is launched with $MODEL, not MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
+rocm-smi || true
+amd-smi || true
+
+# ---- Select trace corpus ----------------------------------------------------
+# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
+# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf
+# signal at high concurrency.
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+#060226
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
+
+# ---- Resolve traces and install deps ----------------------------------------
+resolve_trace_source
+install_agentic_deps
+
+# ---- Cache / offload config -------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+mkdir -p "$RESULT_DIR"
+
+CACHE_ARGS=()
+WARMUP_ARGS=()
+CUDA_GRAPH_MAX_BS="$CONC"
+case "$OFFLOADING" in
+    none)
+        # Leave SGLang's default RadixAttention prefix cache on — agentic
+        # replay needs it; --disable-radix-cache would zero the hit rate.
+        ;;
+    hicache)
+        # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per
+        # TP rank (one hierarchical KV, one hierarchical Mamba), so the
+        # node-total DRAM budget divides by TP and the host-pool count.
+        TOTAL_CPU_DRAM_GB=3000
+        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}"
+        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-300}}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
+        # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which
+        # requires page_size=1. Keep the safer direct/layer_first copy path;
+        # kernel/page_first faults on first prefill in this mode on ROCm.
+        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
+        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
+        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
+        if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
+            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
+        fi
+        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
+            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
+            exit 1
+        fi
+        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
+        CACHE_ARGS=(
+            --page-size "$HICACHE_PAGE_SIZE"
+            --enable-hierarchical-cache
+            --hicache-size "$HICACHE_SIZE_GB"
+            --hicache-io-backend "$HICACHE_IO_BACKEND"
+            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+        )
+        # HiCache startup reaches API readiness but SGLang's internal warmup
+        # request can time out on this path; let aiperf own benchmark traffic.
+        WARMUP_ARGS=(--skip-server-warmup)
+        # Don't force ROCm graph capture at every high concurrency point: clamp
+        # CUDA_GRAPH_MAX_BS to HICACHE_CUDA_GRAPH_MAX_BS (default 256).
+        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-256}"
+        if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
+            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
+        fi
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
+echo "Starting SGLang server..."
+export PYTHONNOUSERSITE=1
+
+python3 -m sglang.launch_server \
+    --attention-backend aiter \
+    --model-path $MODEL \
+    --host=0.0.0.0 \
+    --port $PORT \
+    --tensor-parallel-size $TP \
+    --ep-size $EP_SIZE \
+    --trust-remote-code \
+    --model-loader-extra-config '{"enable_multithread_load": true}' \
+    --watchdog-timeout 1200  \
+    --tokenizer-worker-num 6 \
+    --cuda-graph-max-bs $CONC \
+    --max-running-requests $CONC \
+    --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
+    --mem-fraction-static 0.8 \
+    "${CACHE_ARGS[@]}" \
+    "${WARMUP_ARGS[@]}" \
+    --enable-metrics > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
index ff901b674..8c6f82410 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
@@ -2,18 +2,31 @@
 set -euo pipefail
 set -x
 
-# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang.
+# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang.
+#
+# Base server recipe is adapted from the upstream MI300X reference
+# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe):
+# aiter allreduce fusion, but triton attention backend and mem-fraction 0.8.
+# The agentic harness (resolve_trace_source / build_replay_cmd /
+# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and
+# --disable-radix-cache is dropped because agentic replay needs prefix reuse.
 #
 # Required env vars:
-#   MODEL, TP, CONC, RESULT_DIR
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION
+#
+# OFFLOADING values:
+#   none    - SGLang GPU KV with the default RadixAttention prefix cache.
+#   hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
 
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=131072
-fi
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
+
+SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -30,23 +43,87 @@ else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
 fi
+
 rocm-smi || true
 amd-smi || true
 
+# ---- Select trace corpus ----------------------------------------------------
+# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
+# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf
+# signal at high concurrency.
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+#060226
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
+
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source
 install_agentic_deps
 
-# ---- Start SGLang server ----------------------------------------------------
+# ---- Cache / offload config -------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
 mkdir -p "$RESULT_DIR"
 
+CACHE_ARGS=()
+WARMUP_ARGS=()
+CUDA_GRAPH_MAX_BS="$CONC"
+case "$OFFLOADING" in
+    none)
+        # Leave SGLang's default RadixAttention prefix cache on — agentic
+        # replay needs it; --disable-radix-cache would zero the hit rate.
+        ;;
+    hicache)
+        # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per
+        # TP rank (one hierarchical KV, one hierarchical Mamba), so the
+        # node-total DRAM budget divides by TP and the host-pool count.
+        TOTAL_CPU_DRAM_GB=3000
+        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}"
+        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-300}}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
+        # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which
+        # requires page_size=1. Keep the safer direct/layer_first copy path;
+        # kernel/page_first faults on first prefill in this mode on ROCm.
+        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
+        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
+        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
+        if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
+            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
+        fi
+        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
+            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
+            exit 1
+        fi
+        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
+        CACHE_ARGS=(
+            --page-size "$HICACHE_PAGE_SIZE"
+            --enable-hierarchical-cache
+            --hicache-size "$HICACHE_SIZE_GB"
+            --hicache-io-backend "$HICACHE_IO_BACKEND"
+            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+        )
+        # HiCache startup reaches API readiness but SGLang's internal warmup
+        # request can time out on this path; let aiperf own benchmark traffic.
+        WARMUP_ARGS=(--skip-server-warmup)
+        # Don't force ROCm graph capture at every high concurrency point: clamp
+        # CUDA_GRAPH_MAX_BS to HICACHE_CUDA_GRAPH_MAX_BS (default 256).
+        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-256}"
+        if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
+            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
+        fi
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
 echo "Starting SGLang server..."
 export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
     --attention-backend triton \
-    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
+    --model-path $MODEL \
     --host=0.0.0.0 \
     --port $PORT \
     --tensor-parallel-size $TP \
@@ -56,10 +133,10 @@ python3 -m sglang.launch_server \
     --enable-aiter-allreduce-fusion \
     --cuda-graph-max-bs $CONC \
     --max-running-requests $CONC \
-    --max-prefill-tokens 32768 \
-    --scheduler-recv-interval 30 \
+    --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
     --mem-fraction-static 0.8 \
-    --context-length $MAX_MODEL_LEN \
+    "${CACHE_ARGS[@]}" \
+    "${WARMUP_ARGS[@]}" \
     --enable-metrics > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
@@ -69,4 +146,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index d62e6bc4b..96b9ad4f0 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -210,7 +210,7 @@ else
     #   mia1-p01-g09: pyxis broken (persistently fails to create container filesystem)
     #   mia1-p01-g11: docker.sock permissions denied (cluster-cleanup step fails)
     # Both have been root-caused via #1431/#1432/#1440/#1441/#1443 sweep failures.
-    salloc --partition=$PARTITION --exclude=mia1-p01-g09,mia1-p01-g11 --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=500 --no-shell --job-name="$RUNNER_NAME"
+    salloc --partition=$PARTITION --exclude=mia1-p01-g09,mia1-p01-g11,mia1-p01-g37 --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=500 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1)
 
     srun --jobid=$JOB_ID bash -c "docker stop \$(docker ps -a -q)"

From ba1bb37c2c95215185a2a103516aa21f64bb5b65 Mon Sep 17 00:00:00 2001
From: ajith-sirra-amd <ajith.sirra@amd.com>
Date: Thu, 11 Jun 2026 12:54:36 +0530
Subject: [PATCH 02/21] Add GLM5.1 & Qwen3.5 MI300 Agentic Scripts

Signed-off-by: ajith-sirra-amd <ajith.sirra@amd.com>
---
 .../single_node/agentic/glm5.1_fp8_mi300x.sh  | 117 ++++++++++++++
 .../single_node/agentic/qwen3.5_fp8_mi300x.sh | 147 ++++++++++++++++++
 2 files changed, 264 insertions(+)
 create mode 100644 benchmarks/single_node/agentic/glm5.1_fp8_mi300x.sh
 create mode 100644 benchmarks/single_node/agentic/qwen3.5_fp8_mi300x.sh

diff --git a/benchmarks/single_node/agentic/glm5.1_fp8_mi300x.sh b/benchmarks/single_node/agentic/glm5.1_fp8_mi300x.sh
new file mode 100644
index 000000000..3918ef9de
--- /dev/null
+++ b/benchmarks/single_node/agentic/glm5.1_fp8_mi300x.sh
@@ -0,0 +1,117 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for GLM-5.1 FP8 on MI300X using SGLang.
+#
+# Required env vars:
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION
+#
+# OFFLOADING values:
+#   none    - SGLang GPU KV with the default RadixAttention prefix cache.
+#   hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix.
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+rocm-smi || true
+amd-smi || true
+
+# ---- Resolve traces and install deps ----------------------------------------
+resolve_trace_source
+install_agentic_deps
+
+# ROCm / SGLang performance tuning for MI300X (gfx942)
+export SAFETENSORS_FAST_GPU=1
+
+# ---- Cache / offload config -------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+mkdir -p "$RESULT_DIR"
+
+CACHE_ARGS=()
+CUDA_GRAPH_MAX_BS="$CONC"
+case "$OFFLOADING" in
+    none)
+        # Leave SGLang's default RadixAttention prefix cache on — agentic
+        # replay needs it; --disable-radix-cache would zero the hit rate.
+        ;;
+    hicache)
+        # GLM-5.1 is a dense-KV (non-hybrid) model, so it allocates a single
+        # HiCache host pool per TP rank. --hicache-size is per rank per host
+        # pool while the workflow input is a node-total DRAM budget, so divide
+        # by TP and the host-pool count. Overridable for one-off tuning.
+        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}"
+        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-180}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}"
+        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
+        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
+        if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
+            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
+        fi
+        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
+            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
+            exit 1
+        fi
+        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
+        CACHE_ARGS=(
+            --enable-hierarchical-cache
+            --hicache-size "$HICACHE_SIZE_GB"
+            --hicache-io-backend "$HICACHE_IO_BACKEND"
+            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+        )
+        # Capture ROCm graphs up to full concurrency so the hicache arm is a
+        # fair A/B against the none arm (which captures to $CONC). The MI355X
+        # recipe caps this at 16 due to a high-conc capture crash on that HW;
+        # on MI300X we follow $CONC. Override via env if MI300X hits the same
+        # startup crash at high conc.
+        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-$CONC}"
+        if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
+            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
+        fi
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
+pip install -U transformers
+
+echo "Starting SGLang server..."
+export PYTHONNOUSERSITE=1
+
+python3 -m sglang.launch_server \
+    --model-path $MODEL \
+    --host=0.0.0.0 \
+    --port $PORT \
+    --tensor-parallel-size $TP \
+    --trust-remote-code \
+    --cuda-graph-max-bs $CUDA_GRAPH_MAX_BS \
+    --max-running-requests $CONC \
+    --mem-fraction-static 0.85 \
+    --tool-call-parser glm47 \
+    --reasoning-parser glm45 \
+    --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \
+    --nsa-prefill-backend tilelang \
+    --nsa-decode-backend tilelang \
+    --kv-cache-dtype fp8_e4m3 \
+    --tokenizer-worker-num $((TP*2)) \
+    "${CACHE_ARGS[@]}" \
+    --enable-metrics > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi300x.sh
new file mode 100644
index 000000000..e1032772d
--- /dev/null
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi300x.sh
@@ -0,0 +1,147 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for Qwen3.5 FP8 on MI300X using SGLang.
+#
+# Base server recipe follows the upstream MI300X reference
+# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe):
+# aiter attention backend, aiter allreduce fusion, mem-fraction 0.75.
+# The agentic harness (resolve_trace_source / build_replay_cmd /
+# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and
+# --disable-radix-cache is dropped because agentic replay needs prefix reuse.
+#
+# Required env vars:
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE
+#
+# OFFLOADING values:
+#   none    - SGLang GPU KV with the default RadixAttention prefix cache.
+#   hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix.
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
+
+SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30}
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+rocm-smi || true
+amd-smi || true
+
+# RCCL on these MI300X hosts fails ncclCommInitRank with an unhandled CUDA
+# error when P2P is enabled; disable the P2P transport so TP init falls back
+# to the shared-memory path. Overridable for hosts where P2P works.
+export NCCL_P2P_DISABLE="${NCCL_P2P_DISABLE:-1}"
+
+# ---- Resolve traces and install deps ----------------------------------------
+# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
+# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf
+# signal at high concurrency.
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+
+resolve_trace_source
+install_agentic_deps
+
+# ---- Cache / offload config -------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+mkdir -p "$RESULT_DIR"
+
+CACHE_ARGS=()
+WARMUP_ARGS=()
+CUDA_GRAPH_MAX_BS="$CONC"
+case "$OFFLOADING" in
+    none)
+        # Leave SGLang's default RadixAttention prefix cache on — agentic
+        # replay needs it; --disable-radix-cache would zero the hit rate.
+        ;;
+    hicache)
+        # Qwen3.5's hybrid GDN/Mamba path allocates TWO HiCache host pools per
+        # TP rank (one hierarchical KV, one hierarchical Mamba), so the node
+        # total is HICACHE_SIZE_GB * TP * HICACHE_HOST_POOL_COUNT and the
+        # node-total DRAM budget divides by TP and the host-pool count.
+        # MI300X nodes here expose ~2.3 TB usable CPU DRAM. The harness passes
+        # a generic TOTAL_CPU_DRAM_GB=2500, which at TP=8 yields 2500/8/2=156
+        # GB/pool -> 156*8*2=2496 GB > available -> OOM-kill (137).
+        # Ignore the harness value and default to a node-safe 1900 (1888 GB
+        # allocated at TP=8); override via HICACHE_TOTAL_CPU_DRAM_GB.
+        TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-1900}"
+        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}"
+        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-180}}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
+        # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which
+        # requires page_size=1. Keep the safer direct/layer_first copy path;
+        # kernel/page_first faults on first prefill in this mode on ROCm.
+        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
+        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
+        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
+        if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
+            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
+        fi
+        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
+            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
+            exit 1
+        fi
+        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
+        CACHE_ARGS=(
+            --page-size "$HICACHE_PAGE_SIZE"
+            --enable-hierarchical-cache
+            --hicache-size "$HICACHE_SIZE_GB"
+            --hicache-io-backend "$HICACHE_IO_BACKEND"
+            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+        )
+        # HiCache startup reaches API readiness but SGLang's internal warmup
+        # request can time out on this path; let aiperf own benchmark traffic.
+        WARMUP_ARGS=(--skip-server-warmup)
+        # Capture ROCm graphs up to full concurrency so the hicache arm is a
+        # fair A/B against the none arm (which captures to $CONC). The MI355X
+        # recipe caps this at 16 due to a high-conc capture crash on that HW;
+        # on MI300X we lift it to match $CONC. Override via env if MI300X hits
+        # the same startup crash at high conc.
+        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-$CONC}"
+        if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
+            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
+        fi
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
+echo "Starting SGLang server..."
+export PYTHONNOUSERSITE=1
+
+# following AMD Andy's MI300X recipe
+# https://www.linkedin.com/feed/update/urn:li:activity:7429203734389280768/
+python3 -m sglang.launch_server \
+    --attention-backend aiter \
+    --model-path $MODEL \
+    --host=0.0.0.0 \
+    --port $PORT \
+    --tensor-parallel-size $TP \
+    --ep-size $EP_SIZE \
+    --trust-remote-code \
+    --tokenizer-worker-num 6 \
+    --enable-aiter-allreduce-fusion \
+    --cuda-graph-max-bs $CUDA_GRAPH_MAX_BS \
+    --max-running-requests $CONC \
+    --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
+    --mem-fraction-static 0.75 \
+    "${CACHE_ARGS[@]}" \
+    "${WARMUP_ARGS[@]}" \
+    --enable-metrics > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file

From eba42330b46d3388c0bb7cf78e537410930237dd Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 12 Jun 2026 15:07:17 +0900
Subject: [PATCH 03/21] [AMD] add DSV4-FP4-MI355x atom agentic benchmark and
 master yaml config

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               |  16 ++
 .../agentic/dsv4_fp4_mi355x_atom.sh           | 260 ++++++++++++++++++
 2 files changed, 276 insertions(+)
 create mode 100755 benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index ee8718506..6b6f38fb5 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2174,6 +2174,22 @@ dsv4-fp4-mi355x-atom-mtp:
       search-space:
       - { tp: 8, ep: 1, conc-start: 1, conc-end: 1024, spec-decoding: mtp }
 
+# target
+dsv4-fp4-mi355x-atom-agentic-lmcache:
+  image: rocm/atom-dev:nightly_202606101557
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: atom
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list:    [44, 48, 52, 56, 60] }
+      #- { tp: 8, ep: 1, offloading: lmcache, conc-list:    [44, 48, 52, 56, 60] }
+
 qwen3.5-bf16-mi325x-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
   model: Qwen/Qwen3.5-397B-A17B
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh
new file mode 100755
index 000000000..701f39b41
--- /dev/null
+++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh
@@ -0,0 +1,260 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for DeepSeek-V4-Pro FP4 on MI355X using ATOM.
+#
+# Required env vars:
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION
+#
+# OFFLOADING values:
+#   none    - ATOM GPU KV only.
+#   cpu     - ATOM native CPU offload.
+#   lmcache - LMCache MP server + ATOM LMCacheMPConnector.
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
+
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+
+# ROCR/HIP visibility for ATOM 
+if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
+    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
+fi
+
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE.
+# NOTE(review): the server below is launched with --model $MODEL, not MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
+rocm-smi || true
+amd-smi || true
+
+# ---- Resolve traces and install deps ----------------------------------------
+# https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-060826
+export WEKA_LOADER_OVERRIDE=semianalysisai/cc-traces-weka-with-subagents-060826
+
+# ---- Resolve traces and install deps ----------------------------------------
+resolve_trace_source
+install_agentic_deps
+
+# ---- Server config ----------------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
+mkdir -p "$RESULT_DIR"
+
+OFFLOAD_ARGS=()
+PREFIX_CACHE_ARGS=()
+
+# ---- Lmcache config ----------------------------------------------------------
+LMCACHE_PID=""
+
+cleanup_lmcache_server() {
+    if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then
+        kill "$LMCACHE_PID" 2>/dev/null || true
+        wait "$LMCACHE_PID" 2>/dev/null || true
+    fi
+}
+
+trap cleanup_lmcache_server EXIT
+
+wait_for_lmcache_ready() {
+    { set +x; } 2>/dev/null
+    local attempts="${LMCACHE_READY_ATTEMPTS:-120}"
+    local tail_pid=""
+
+    while [ ! -f "$LMCACHE_LOG" ]; do
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before creating log file. Exiting." >&2
+            exit 1
+        fi
+        sleep 1
+    done
+
+    tail -f -n +1 "$LMCACHE_LOG" &
+    tail_pid=$!
+
+    for ((i = 1; i <= attempts; i++)); do
+        if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            return 0
+        fi
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before becoming healthy. Log follows:" >&2
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            cat "$LMCACHE_LOG" >&2 || true
+            exit 1
+        fi
+        sleep 1
+    done
+
+    echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2
+    kill "$tail_pid" 2>/dev/null || true
+    wait "$tail_pid" 2>/dev/null || true
+    cat "$LMCACHE_LOG" >&2 || true
+    exit 1
+}
+
+case "$OFFLOADING" in
+    none) ;;
+    cpu)
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+        # MI355X nodes have ~2.7 TiB of host DRAM available for offload; reserve
+        # 2.5 TB for the pool (~200 GB headroom for worker RSS / page cache /
+        # slurm cgroup). Scale by TP/8, multiplying first so 8/TP cannot truncate.
+        #TODO: fix
+        TOTAL_CPU_DRAM_GB=3000
+        TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB * TP / 8))}"
+        # Use vLLM's regular native KV-offload path (OffloadingConnector),
+        # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to
+        # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1
+        # would switch it to SimpleCPUOffloadConnector. We intentionally leave
+        # that env var UNSET here so the regular OffloadingConnector path is
+        # used. The shortcut --kv_offloading_backend native + --kv_offloading_size
+        # form constructs the KVTransferConfig at engine startup
+        # (vllm/config/vllm.py:662).
+        OFFLOAD_ARGS=(
+            --kv_offloading_backend native
+            --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB"
+            --disable-hybrid-kv-cache-manager
+        )
+        ;;
+    lmcache)
+        { set +x; } 2>/dev/null
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+
+        [[ -d LMCache/.git ]] || git clone https://github.com/LMCache/LMCache.git
+        cd LMCache
+        pip install -r requirements/build.txt 
+        CXX=hipcc BUILD_WITH_HIP=1 pip install -e .   --no-build-isolation
+        cd ..
+
+        python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
+
+        # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
+        # pool, but let the external MP server own that pool so vLLM does not
+        # split --kv-offloading-size across TP ranks through the integrated
+        # LMCache backend.
+        #TODO: fix
+        TOTAL_CPU_DRAM_GB=3000
+        LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
+        LMCACHE_PORT="${LMCACHE_PORT:-5555}"
+        LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
+        # LMCacheMPConnector concatenates lmcache.mp.host and port into the
+        # ZMQ endpoint. Bind the server to a raw host, but pass the connector a
+        # ZMQ-style host string.
+        LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}"
+        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB * TP / 8))}"
+        LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
+        # LMCache read locks are leases on chunks that lookup has promised
+        # vLLM can retrieve. The default 300s TTL is too short for this
+        # long-context agentic queue: TP8/conc32 can spend >300s between
+        # lookup and retrieve while GPU KV is saturated, which leaves the
+        # object present in L1 but no longer readable. Keep the 2.5 TB pool
+        # size unchanged and only extend the lookup-to-retrieve lease.
+        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}"
+        LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
+        LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
+        export PYTHONHASHSEED="${PYTHONHASHSEED:-0}"
+        export LMCACHE_BLOCKING_TIMEOUT_SECS=120
+
+        echo "Starting LMCache MP server..."
+        LMCACHE_CMD=(
+            lmcache server
+            --host "$LMCACHE_HOST"
+            --port "$LMCACHE_PORT"
+            --http-host "$LMCACHE_HOST"
+            --http-port "$LMCACHE_HTTP_PORT"
+            --l1-size-gb "$LMCACHE_L1_SIZE_GB"
+            --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB"
+            --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS"
+            --chunk-size "$LMCACHE_CHUNK_SIZE"
+            --max-workers "$LMCACHE_MAX_WORKERS"
+            --eviction-policy LRU
+        )
+        printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt"
+        printf '\n' >> "$RESULT_DIR/lmcache_command.txt"
+        "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 &
+        LMCACHE_PID=$!
+        echo "LMCache server PID: $LMCACHE_PID"
+        wait_for_lmcache_ready
+        # (srok) TODO:
+        PREFIX_CACHE_ARGS=(--enable_prefix_caching)
+        OFFLOAD_ARGS=(
+            --kv-transfer-config
+            "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}"
+            --disable-hybrid-kv-cache-manager
+        )
+        ;;
+    *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;;
+esac
+
+# ---- LLM server config ----------------------------------------------------------
+EP_ARGS=()
+if [ "$EP_SIZE" -gt 1 ]; then
+    EP_ARGS=(--enable-expert-parallel)
+fi
+
+echo "Starting ATOM server..."
+export PYTHONNOUSERSITE=1
+
+# MEC FW <177 RCCL reclaim workaround; "|| true" = no abort if lookup fails.
+version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}' || true)
+if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
+    export HSA_NO_SCRATCH_RECLAIM=1
+fi
+
+# Parallelism flags: TP always; EP_ARGS (built above) adds
+# --enable-expert-parallel when EP_SIZE > 1, so TP+EP no longer runs as
+# pure TP; DP attention is layered on top when requested.
+PARALLEL_ARGS=(-tp "$TP")
+PARALLEL_ARGS+=("${EP_ARGS[@]}")
+if [ "$DP_ATTENTION" = "true" ]; then
+    PARALLEL_ARGS+=(--enable-dp-attention)
+fi
+
+set -x
+export ATOM_DISABLE_MMAP=true
+export AITER_BF16_FP8_MOE_BOUND=0
+export ATOM_MOE_GU_ITLV=1
+{ set +x; } 2>/dev/null
+
+ATOM_CMD=(
+    python3 -m atom.entrypoints.openai_server \
+        --model $MODEL \
+        --server-port $PORT \
+        "${PARALLEL_ARGS[@]}" \
+        --kv_cache_dtype fp8 \
+        --trust-remote-code \
+        --gpu-memory-utilization 0.85 \
+    "${PREFIX_CACHE_ARGS[@]}"
+    "${OFFLOAD_ARGS[@]}"
+)
+printf '%q ' "${ATOM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
+printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
+"${ATOM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file

From 32f50079d9911470bf28e6810342cb4d29dbf9a1 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 12 Jun 2026 15:13:36 +0900
Subject: [PATCH 04/21] [AMD] update DSV4-FP4-MI355x atom agentic benchmark and
 master yaml config

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml                        | 3 ++-
 benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 6b6f38fb5..7ce6882c3 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2187,7 +2187,8 @@ dsv4-fp4-mi355x-atom-agentic-lmcache:
     agentic-coding:
     - duration: 1800
       search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list:    [44, 48, 52, 56, 60] }
+      - { tp: 8, ep: 1, offloading: none, conc-list:    [52] }
+      #- { tp: 8, ep: 1, offloading: none, conc-list:    [44, 48, 52, 56, 60] }
       #- { tp: 8, ep: 1, offloading: lmcache, conc-list:    [44, 48, 52, 56, 60] }
 
 qwen3.5-bf16-mi325x-sglang-mtp:
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh
index 701f39b41..1ec554669 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh
@@ -46,7 +46,7 @@ amd-smi || true
 
 # ---- Resolve traces and install deps ----------------------------------------
 # https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-060826
-export WEKA_LOADER_OVERRIDE=semianalysisai/cc-traces-weka-with-subagents-060826
+export WEKA_LOADER_OVERRIDE=semianalysisai_cc-traces-weka-with-subagents-060826
 
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source

From 351e729285088efd7db8a135ad13429e37e6b49d Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 12 Jun 2026 17:04:32 +0900
Subject: [PATCH 05/21] [AMD] dsv4_fp4_mi355x_atom.sh: update agentic benchmark
 script

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh
index 1ec554669..f1e680cbe 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh
@@ -46,7 +46,7 @@ amd-smi || true
 
 # ---- Resolve traces and install deps ----------------------------------------
 # https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-060826
-export WEKA_LOADER_OVERRIDE=semianalysisai_cc-traces-weka-with-subagents-060826
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826
 
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source

From 64ce90cde70f6d504ef067415ed84481f5a803d9 Mon Sep 17 00:00:00 2001
From: ajith-sirra-amd <ajith.sirra@amd.com>
Date: Fri, 12 Jun 2026 14:12:23 +0530
Subject: [PATCH 06/21] Add DSV4 MI355X Agentic Scripts

Signed-off-by: ajith-sirra-amd <ajith.sirra@amd.com>
---
 .github/configs/amd-master.yaml               |  15 ++
 .../single_node/agentic/dsv4_fp4_mi355x.sh    | 208 ++++++++++++++++++
 .../agentic/dsv4_fp4_mi355x_sglang.sh         |  44 +++-
 3 files changed, 263 insertions(+), 4 deletions(-)
 create mode 100644 benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 7ce6882c3..7460aca80 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2736,3 +2736,18 @@ dsv4-fp4-mi355x-sglang-agentic:
       search-space:
       - { tp: 8, offloading: none, conc-list: [16, 32, 64] }
       - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] }
+
+dsv4-fp4-mi355x-sglang-agentic-hicache:
+  image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [52] }
+      - { tp: 8, offloading: hicache, conc-list: [52] }
\ No newline at end of file
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh
new file mode 100644
index 000000000..3d05e0bf4
--- /dev/null
+++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh
@@ -0,0 +1,208 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for DeepSeek-V4-Pro FP4 on MI355X using SGLang.
+# Adapted from benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh (fixed-seq-len
+# sibling) with the agentic harness (build_replay_cmd / write_agentic_result_json
+# / analyze_benchmark_distributions) swapped in for run_benchmark_serving.
+#
+# KV offload: OFFLOADING=none keeps the default GPU radix cache only, while
+# OFFLOADING=hicache enables SGLang's hierarchical (host-DRAM) cache tier. The
+# matching config (dsv4-fp4-mi355x-sglang-agentic-hicache) sweeps both values.
+#
+# Required env vars:
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE, DP_ATTENTION
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION
+
+if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
+    MAX_MODEL_LEN=1000000
+fi
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+
+# ROCR/HIP visibility under slurm cgroups.
+if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
+    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
+fi
+
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+rocm-smi || true
+amd-smi || true
+
+# ---- Resolve traces and install deps ----------------------------------------
+resolve_trace_source
+install_agentic_deps
+
+# Map OFFLOADING to server cache flags: none (GPU radix cache only) or hicache
+# (hierarchical host-DRAM tier); any other value is rejected below.
+CACHE_ARGS=()
+WARMUP_ARGS=()
+CUDA_GRAPH_MAX_BS="$CONC"
+[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64
+case "$OFFLOADING" in
+    none)
+        # Leave SGLang's default RadixAttention prefix cache on — agentic
+        # replay needs it; --disable-radix-cache would zero the hit rate.
+        ;;
+    hicache)
+        # DeepSeek V4 HiCache uses ratio-based capacity control, not GB-based.
+        # DSv4 allocates several physical host sub-pools for each logical host
+        # token. MI355X nodes have ~3 TB of host DRAM (similar to B200's 3.8
+        # TiB), so ratio=8 at TP≥8 provides a large useful CPU tier within the
+        # node budget. Lower TP configs use higher ratios to maintain adequate
+        # host token capacity without exceeding DRAM limits.
+        if [ "$TP" -ge 8 ]; then
+            DEFAULT_HICACHE_RATIO=8
+        else
+            DEFAULT_HICACHE_RATIO=16
+        fi
+        HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}"
+        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first_direct}"
+        export SGLANG_ENABLE_UNIFIED_RADIX_TREE=1
+        CACHE_ARGS=(
+            --enable-hierarchical-cache
+            --hicache-ratio "$HICACHE_RATIO"
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+            --hicache-io-backend "$HICACHE_IO_BACKEND"
+            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
+        )
+        echo "HiCache DSv4 CPU tier: ratio=$HICACHE_RATIO, write_policy=$HICACHE_WRITE_POLICY, io_backend=$HICACHE_IO_BACKEND, mem_layout=$HICACHE_MEM_LAYOUT"
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
+# Transformers in the container doesn't recognize the `deepseek_v4` model_type.
+# PR #23608's fallback in hf_transformers_utils.get_config tries to handle this
+# by writing a patched config to /tmp, but in practice isn't catching the error
+# in this image. Patch the cached config.json directly instead: set model_type
+# to `deepseek_v3` so AutoConfig.from_pretrained succeeds, and keep
+# architectures=['DeepseekV4ForCausalLM'] so SGLang dispatches to its native
+# DSv4 model class (python/sglang/srt/models/deepseek_v4.py).
+python3 << PYEOF
+import json
+from huggingface_hub import hf_hub_download
+path = hf_hub_download(repo_id="$MODEL", filename="config.json")
+with open(path) as f:
+    config = json.load(f)
+if config.get("model_type") == "deepseek_v4":
+    config["model_type"] = "deepseek_v3"
+    with open(path, "w") as f:
+        json.dump(config, f, indent=2)
+    print(f"Patched {path}: model_type deepseek_v4 -> deepseek_v3")
+else:
+    print(f"No patch needed: model_type is {config.get('model_type')!r}")
+PYEOF
+
+# DSv4 FP4-experts path. Mirrors the env block in the fixed-seq-len sibling
+# (benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh), which tracks the active
+# block in python/run_dsv4.sh on the amd/deepseek_v4 branch:
+#   SGLANG_DSV4_FP4_EXPERTS=True   -> route experts through FP4 kernels
+#   SGLANG_FORCE_TRITON_MOE_FP8=0  -> dispatch MoE through aiter and apply
+#                                    the swiglu_limit clamp in the triton
+#                                    MoE fallback path.
+export SGLANG_REASONING_EFFORT=max
+export SGLANG_OPT_USE_FUSED_COMPRESS=true
+export SGLANG_OPT_USE_OLD_COMPRESSOR=true
+export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false
+export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false
+export SGLANG_OPT_USE_FUSED_HASH_TOPK=false
+export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false
+export SGLANG_OPT_USE_TILELANG_MHC_PRE=false
+export SGLANG_OPT_USE_TILELANG_MHC_POST=false
+export SGLANG_OPT_USE_AITER_MHC_PRE=true
+export SGLANG_OPT_USE_AITER_MHC_POST=true
+export SGLANG_ENABLE_THINKING=1
+export SGLANG_USE_AITER=1
+export SGLANG_USE_ROCM700A=1
+export SGLANG_TOPK_TRANSFORM_512_TORCH=0
+export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1
+export SGLANG_DSV4_FP4_EXPERTS=True
+export SGLANG_OPT_DPSK_V4_RADIX=0
+export SGLANG_OPT_USE_OVERLAP_STORE_CACHE=false
+export SGLANG_OPT_USE_FUSED_STORE_CACHE=false
+export SGLANG_FORCE_TRITON_MOE_FP8=0
+export SGLANG_HACK_FLASHMLA_BACKEND=tilelang
+export SGLANG_OPT_USE_TILELANG_INDEXER=true
+export SGLANG_OPT_USE_TRITON_SWA_PREPARE=true
+
+# ---- Server config ----------------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+mkdir -p "$RESULT_DIR"
+
+# Parallelism: pure TP, TP+EP, or DEP (DP-attn + EP). Matches the dsv4 b200
+# vllm agentic launcher so the agentic sweep can probe both interactivity and
+# throughput regimes.
+PARALLEL_ARGS=(--tensor-parallel-size "$TP")
+if [ "$DP_ATTENTION" = "true" ]; then
+    PARALLEL_ARGS+=(
+        --dp "$TP"
+        --enable-dp-attention
+        --enable-prefill-delayer
+    )
+fi
+if [ "${EP_SIZE:-1}" -gt 1 ]; then
+    PARALLEL_ARGS+=(--ep-size "$EP_SIZE")
+fi
+
+# --max-running-requests is per-engine. With DP-attn each DP engine handles
+# only CONC/$TP sequences in steady state (the agentic harness load-balances
+# users across DP ranks), so size the per-engine cap to that.
+# Pure TP is a single engine and sees all CONC sequences itself.
+if [ "$DP_ATTENTION" = "true" ]; then
+    PER_ENGINE_MAX_RUNNING=$(( CONC / TP ))
+    [ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1
+else
+    PER_ENGINE_MAX_RUNNING=$CONC
+fi
+
+echo "Starting sglang server..."
+python3 -m sglang.launch_server \
+    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
+    --host=0.0.0.0 \
+    --port "$PORT" \
+    "${PARALLEL_ARGS[@]}" \
+    --trust-remote-code \
+    --attention-backend compressed \
+    --max-running-requests "$PER_ENGINE_MAX_RUNNING" \
+    --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" \
+    --page-size 256 \
+    --context-length "$MAX_MODEL_LEN" \
+    --chunked-prefill-size 8192 \
+    --disable-shared-experts-fusion \
+    --tool-call-parser deepseekv4 \
+    --reasoning-parser deepseek-v4 \
+    --chat-template "$(dirname "$0")/../chat_templates/deepseek_v4_thinking.jinja" \
+    --watchdog-timeout 1800 \
+    --enable-metrics \
+    "${CACHE_ARGS[@]}" \
+    "${WARMUP_ARGS[@]}" > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
index 029c8ea7f..3d05e0bf4 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
@@ -51,10 +51,43 @@ install_agentic_deps
 
 # Reject anything other than none: this launcher has no SGLang CPU-offload
 # wiring (different surface than vLLM's SimpleCPUOffloadConnector).
+CACHE_ARGS=()
+WARMUP_ARGS=()
+CUDA_GRAPH_MAX_BS="$CONC"
+[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64
 case "$OFFLOADING" in
-    none) ;;
+    none)
+        # Leave SGLang's default RadixAttention prefix cache on — agentic
+        # replay needs it; --disable-radix-cache would zero the hit rate.
+        ;;
+    hicache)
+        # DeepSeek V4 HiCache uses ratio-based capacity control, not GB-based.
+        # DSv4 allocates several physical host sub-pools for each logical host
+        # token. MI355X nodes have ~3 TB of host DRAM (similar to B200's 3.8
+        # TiB), so ratio=8 at TP≥8 provides a large useful CPU tier within the
+        # node budget. Lower TP configs use higher ratios to maintain adequate
+        # host token capacity without exceeding DRAM limits.
+        if [ "$TP" -ge 8 ]; then
+            DEFAULT_HICACHE_RATIO=8
+        else
+            DEFAULT_HICACHE_RATIO=16
+        fi
+        HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}"
+        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first_direct}"
+        export SGLANG_ENABLE_UNIFIED_RADIX_TREE=1
+        CACHE_ARGS=(
+            --enable-hierarchical-cache
+            --hicache-ratio "$HICACHE_RATIO"
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+            --hicache-io-backend "$HICACHE_IO_BACKEND"
+            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
+        )
+        echo "HiCache DSv4 CPU tier: ratio=$HICACHE_RATIO, write_policy=$HICACHE_WRITE_POLICY, io_backend=$HICACHE_IO_BACKEND, mem_layout=$HICACHE_MEM_LAYOUT"
+        ;;
     *)
-        echo "Error: dsv4_fp4_mi355x_sglang.sh only supports OFFLOADING=none (got '$OFFLOADING')" >&2
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
         exit 1
         ;;
 esac
@@ -152,7 +185,7 @@ python3 -m sglang.launch_server \
     --trust-remote-code \
     --attention-backend compressed \
     --max-running-requests "$PER_ENGINE_MAX_RUNNING" \
-    --cuda-graph-max-bs "$PER_ENGINE_MAX_RUNNING" \
+    --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" \
     --page-size 256 \
     --context-length "$MAX_MODEL_LEN" \
     --chunked-prefill-size 8192 \
@@ -160,7 +193,10 @@ python3 -m sglang.launch_server \
     --tool-call-parser deepseekv4 \
     --reasoning-parser deepseek-v4 \
     --chat-template "$(dirname "$0")/../chat_templates/deepseek_v4_thinking.jinja" \
-    --watchdog-timeout 1800 > "$SERVER_LOG" 2>&1 &
+    --watchdog-timeout 1800 \
+    --enable-metrics \
+    "${CACHE_ARGS[@]}" \
+    "${WARMUP_ARGS[@]}" > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
 

From 8ca4bc1b3eb6fc1fdd2fdbe702c56dcd16de37cc Mon Sep 17 00:00:00 2001
From: ajith-sirra-amd <ajith.sirra@amd.com>
Date: Fri, 12 Jun 2026 14:17:52 +0530
Subject: [PATCH 07/21] Add DSV4 MI355X Agentic Scripts

Signed-off-by: ajith-sirra-amd <ajith.sirra@amd.com>
---
 benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh        | 4 ++++
 benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh
index 3d05e0bf4..236895cd2 100644
--- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh
@@ -45,6 +45,10 @@ fi
 rocm-smi || true
 amd-smi || true
 
+# ---- Resolve traces and install deps ----------------------------------------
+# https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-060826
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826
+
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source
 install_agentic_deps
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
index 3d05e0bf4..236895cd2 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
@@ -45,6 +45,10 @@ fi
 rocm-smi || true
 amd-smi || true
 
+# ---- Resolve traces and install deps ----------------------------------------
+# https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-060826
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826
+
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source
 install_agentic_deps

From 37f57a70e39ad8b1a5189a6698d8a9be9b609119 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Mon, 15 Jun 2026 01:59:58 +0900
Subject: [PATCH 08/21] [AMD] update DSV4-FP4-MI355X SGLang agentic benchmark
 and master yaml config

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               |   8 +-
 .../fixed_seq_len/dsv4_fp4_mi355x_sglang.sh   | 254 ++++++++++--------
 2 files changed, 149 insertions(+), 113 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 7460aca80..9b9cceb12 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2737,8 +2737,9 @@ dsv4-fp4-mi355x-sglang-agentic:
       - { tp: 8, offloading: none, conc-list: [16, 32, 64] }
       - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] }
 
+# target
 dsv4-fp4-mi355x-sglang-agentic-hicache:
-  image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
@@ -2749,5 +2750,6 @@ dsv4-fp4-mi355x-sglang-agentic-hicache:
     agentic-coding:
     - duration: 1800
       search-space:
-      - { tp: 8, offloading: none, conc-list: [52] }
-      - { tp: 8, offloading: hicache, conc-list: [52] }
\ No newline at end of file
+        #DPA, conc>=64
+      - { tp: 8, dp-attn: false, offloading: none, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256]  }
+      - { tp: 8, dp-attn: false, offloading: hicache, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256]  }
\ No newline at end of file
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh
index b02a09489..c6f6cba25 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh
@@ -1,101 +1,138 @@
 #!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for DeepSeek-V4-Pro FP4 on MI355X using SGLang.
+# Adapted from benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh (fixed-seq-len
+# sibling) with the agentic harness (build_replay_cmd / write_agentic_result_json
+# / analyze_benchmark_distributions) swapped in for run_benchmark_serving.
+#
+# This launcher does NOT support CPU offload. SGLang's KV offload paths are
+# different from vLLM's SimpleCPUOffloadConnector, and the matching agentic
+# config (dsv4-fp4-mi355x-sglang-agentic) only sweeps offloading=none.
+#
+# Required env vars:
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars \
-    MODEL \
-    TP \
-    DP_ATTENTION \
-    EP_SIZE \
-    CONC \
-    ISL \
-    OSL \
-    RANDOM_RANGE_RATIO \
-    RESULT_FILENAME \
-    MAX_MODEL_LEN
-
-if [[ -n "$SLURM_JOB_ID" ]]; then
-  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION
+
+if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
+    MAX_MODEL_LEN=1000000
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
-
-# sglang ships in the image at the SHA encoded in the image tag (built
-# from the amd/deepseek_v4 branch in sgl-project/sglang). To bump sglang,
-# bump the image tag in .github/configs/amd-master.yaml.
-
-# Transformers in the container doesn't recognize the `deepseek_v4` model_type.
-# PR #23608's fallback in hf_transformers_utils.get_config tries to handle this
-# by writing a patched config to /tmp, but in practice isn't catching the error
-# in this image. Patch the cached config.json directly instead: set model_type
-# to `deepseek_v3` so AutoConfig.from_pretrained succeeds, and keep
-# architectures=['DeepseekV4ForCausalLM'] so SGLang dispatches to its native
-# DSv4 model class (python/sglang/srt/models/deepseek_v4.py).
-python3 << PYEOF
-import json
-from huggingface_hub import hf_hub_download
-path = hf_hub_download(repo_id="$MODEL", filename="config.json")
-with open(path) as f:
-    config = json.load(f)
-if config.get("model_type") == "deepseek_v4":
-    config["model_type"] = "deepseek_v3"
-    with open(path, "w") as f:
-        json.dump(config, f, indent=2)
-    print(f"Patched {path}: model_type deepseek_v4 -> deepseek_v3")
-else:
-    print(f"No patch needed: model_type is {config.get('model_type')!r}")
-PYEOF
-
-# DSv4 FP4-experts path. Tracks the env block in python/run_dsv4.sh on the
-# amd/deepseek_v4 branch (HEAD's active block is FP8; we override the two
-# FP4-specific flags below):
-#   SGLANG_DSV4_FP4_EXPERTS=True   -> route experts through the FP4 kernels
-#   SGLANG_FORCE_TRITON_MOE_FP8=0  -> dispatch MoE through aiter and apply
-#                                    the swiglu_limit clamp in the triton
-#                                    MoE fallback path.
-export SGLANG_REASONING_EFFORT=max
-export SGLANG_OPT_USE_FUSED_COMPRESS=true
-export SGLANG_OPT_USE_OLD_COMPRESSOR=false
-export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false
-export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false
-export SGLANG_OPT_USE_FUSED_HASH_TOPK=true
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+
+# ROCR/HIP visibility under slurm cgroups.
+if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
+    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
+fi
+
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+rocm-smi || true
+amd-smi || true
+
+# ---- Resolve traces and install deps ----------------------------------------
+# https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-060826
+# export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226
+
+# ---- Resolve traces and install deps ----------------------------------------
+resolve_trace_source
+install_agentic_deps
+
+# ---- Server config ----------------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+mkdir -p "$RESULT_DIR"
+
+# ---- Hicache config ----------------------------------------------------------
+# Reject anything other than none: this launcher has no SGLang CPU-offload
+# wiring (different surface than vLLM's SimpleCPUOffloadConnector).
+
+case "$OFFLOADING" in
+    none)
+        # Leave SGLang's default RadixAttention prefix cache on — agentic
+        # replay needs it; --disable-radix-cache would zero the hit rate.
+        ;;
+    hicache)
+        # DeepSeek V4 HiCache uses ratio-based capacity control, not GB-based.
+        # DSv4 allocates several physical host sub-pools for each logical host
+        # token. MI355X nodes have ~3 TB of host DRAM (similar to B200's 3.8
+        # TiB), so ratio=8 at TP≥8 provides a large useful CPU tier within the
+        # node budget. Lower TP configs use higher ratios to maintain adequate
+        # host token capacity without exceeding DRAM limits.
+        if [ "$TP" -ge 8 ]; then
+            DEFAULT_HICACHE_RATIO=8
+        else
+            DEFAULT_HICACHE_RATIO=16
+        fi
+        HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}"
+        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first_direct}"
+        export SGLANG_ENABLE_UNIFIED_RADIX_TREE=1
+        CACHE_ARGS=(
+            --enable-hierarchical-cache
+            --hicache-ratio "$HICACHE_RATIO"
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+            --hicache-io-backend "$HICACHE_IO_BACKEND"
+            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
+        )
+        echo "HiCache DSv4 CPU tier: ratio=$HICACHE_RATIO, write_policy=$HICACHE_WRITE_POLICY, io_backend=$HICACHE_IO_BACKEND, mem_layout=$HICACHE_MEM_LAYOUT"
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
+# ---- LLM server config ----------------------------------------------------------
+
+CACHE_ARGS=()
+WARMUP_ARGS=()
+CUDA_GRAPH_MAX_BS="$CONC"
+[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64
+
+export SGLANG_DEFAULT_THINKING=1
+export SGLANG_DSV4_REASONING_EFFORT=max
 export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false
+export SGLANG_USE_AITER=1
+export SGLANG_USE_ROCM700A=0
+export SGLANG_OPT_USE_FUSED_COMPRESS=true
+export SGLANG_HACK_FLASHMLA_BACKEND=unified_kv_triton
+export SGLANG_OPT_FP8_WO_A_GEMM=false
+export SGLANG_OPT_USE_JIT_INDEXER_METADATA=false
+export SGLANG_OPT_USE_TOPK_V2=false
+export SGLANG_OPT_USE_AITER_INDEXER=true
+export SGLANG_OPT_USE_TILELANG_INDEXER=false
 export SGLANG_OPT_USE_TILELANG_MHC_PRE=false
 export SGLANG_OPT_USE_TILELANG_MHC_POST=false
-export SGLANG_OPT_USE_AITER_MHC_PRE=true
-export SGLANG_OPT_USE_AITER_MHC_POST=true
-export SGLANG_ENABLE_THINKING=1
-export SGLANG_USE_AITER=1
-export SGLANG_USE_ROCM700A=1
-export SGLANG_TOPK_TRANSFORM_512_TORCH=0
 export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1
-export SGLANG_DSV4_FP4_EXPERTS=True
-export SGLANG_OPT_DPSK_V4_RADIX=1
-export SGLANG_OPT_USE_OVERLAP_STORE_CACHE=false
-export SGLANG_OPT_USE_FUSED_STORE_CACHE=true
-export SGLANG_FORCE_TRITON_MOE_FP8=0
-export SGLANG_HACK_FLASHMLA_BACKEND=triton
-export SGLANG_OPT_USE_TILELANG_INDEXER=true
-export SGLANG_OPT_USE_TRITON_SWA_PREPARE=true
+export SGLANG_OPT_USE_FUSED_COMPRESS_TRITON=true
 export AITER_BF16_FP8_MOE_BOUND=0
-export SGLANG_OPT_FUSE_WQA_WKV=true
-export SGLANG_OPT_USE_FUSED_PAGED_COMPRESS=true
-export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=0
+export SGLANG_EAGER_INPUT_NO_COPY=true
 
-SERVER_LOG=/workspace/server.log
-
-EVAL_CONTEXT_ARGS=""
-if [ "${EVAL_ONLY}" = "true" ]; then
-    setup_eval_context
-    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
-fi
-# Start GPU monitoring (power, temperature, clocks every second)
-start_gpu_monitor
+# multi-stream
+export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=false
+export SGLANG_ROCM_USE_MULTI_STREAM=false
 
-PARALLEL_ARGS=(
-    --tensor-parallel-size "$TP"
-)
-if [ "${DP_ATTENTION}" = "true" ]; then
+# Parallelism: pure TP, TP+EP, or DEP (DP-attn + EP). Matches the dsv4 b200
+# vllm agentic launcher so the agentic sweep can probe both interactivity and
+# throughput regimes.
+PARALLEL_ARGS=(--tensor-parallel-size "$TP")
+if [ "$DP_ATTENTION" = "true" ]; then
     PARALLEL_ARGS+=(
         --dp "$TP"
         --enable-dp-attention
@@ -106,14 +143,26 @@ if [ "${EP_SIZE:-1}" -gt 1 ]; then
     PARALLEL_ARGS+=(--ep-size "$EP_SIZE")
 fi
 
-python3 -m sglang.launch_server \
+# --max-running-requests is per-engine. With DP-attn each DP engine handles
+# only CONC/$TP sequences in steady state (the agentic harness load-balances
+# users across DP ranks), so size the per-engine cap to that.
+# Pure TP is a single engine and sees all CONC sequences itself.
+if [ "$DP_ATTENTION" = "true" ]; then
+    PER_ENGINE_MAX_RUNNING=$(( CONC / TP ))
+    [ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1
+else
+    PER_ENGINE_MAX_RUNNING=$CONC
+fi
+
+echo "Starting sglang server..."
+sglang serve \
     --model-path $MODEL \
     --host=0.0.0.0 \
     --port $PORT \
     "${PARALLEL_ARGS[@]}" \
     --trust-remote-code \
     --disable-radix-cache \
-    --attention-backend compressed \
+    --attention-backend dsv4 \
     --max-running-requests ${CONC} \
     --mem-fraction-static 0.90 \
     --swa-full-tokens-ratio 0.15 \
@@ -124,31 +173,16 @@ python3 -m sglang.launch_server \
     --tool-call-parser deepseekv4 \
     --reasoning-parser deepseek-v4 \
     --chat-template "$(dirname "$0")/../chat_templates/deepseek_v4_thinking.jinja" \
-    --watchdog-timeout 1800 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
-
+    --watchdog-timeout 1800 \
+    --enable-metrics \
+    "${CACHE_ARGS[@]}" \
+    "${WARMUP_ARGS[@]}" > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
 
-# Wait for server to be ready
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-run_benchmark_serving \
-    --model "$MODEL" \
-    --port "$PORT" \
-    --backend vllm \
-    --input-len "$ISL" \
-    --output-len "$OSL" \
-    --random-range-ratio "$RANDOM_RANGE_RATIO" \
-    --num-prompts "$((CONC * 10))" \
-    --max-concurrency "$CONC" \
-    --result-filename "$RESULT_FILENAME" \
-    --result-dir /workspace/
-
-# After throughput, run evaluation only if RUN_EVAL is true
-if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT"
-    append_lm_eval_summary
-fi
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
 
-# Stop GPU monitoring
-stop_gpu_monitor
-set +x
+run_agentic_replay_and_write_outputs "$RESULT_DIR"

From 76d90e0dcbd5ac4dc02c8a90d97428e0481694b3 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Mon, 15 Jun 2026 09:20:25 +0900
Subject: [PATCH 09/21] [AMD] update DSV4-FP4-MI355X SGLang
 agentic/fixed-seq-len benchmark scripts and master yaml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               |   6 +-
 .../agentic/dsv4_fp4_mi355x_sglang.sh         | 104 ++++------
 .../fixed_seq_len/dsv4_fp4_mi355x_sglang.sh   | 180 ++++++------------
 3 files changed, 100 insertions(+), 190 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 9b9cceb12..d2a90a6b1 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2751,5 +2751,7 @@ dsv4-fp4-mi355x-sglang-agentic-hicache:
     - duration: 1800
       search-space:
         #DPA, conc>=64
-      - { tp: 8, dp-attn: false, offloading: none, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256]  }
-      - { tp: 8, dp-attn: false, offloading: hicache, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256]  }
\ No newline at end of file
+      #- { tp: 8, dp-attn: false, offloading: none, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256]  }
+      #- { tp: 8, dp-attn: false, offloading: hicache, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256]  }
+      - { tp: 8, dp-attn: false, offloading: none, conc-list: [64]  }
+      - { tp: 8, dp-attn: false, offloading: hicache, conc-list: [64]  }
\ No newline at end of file
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
index 236895cd2..c6f6cba25 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
@@ -47,18 +47,20 @@ amd-smi || true
 
 # ---- Resolve traces and install deps ----------------------------------------
 # https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-060826
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826
+# export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226
 
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source
 install_agentic_deps
 
+# ---- Server config ----------------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+mkdir -p "$RESULT_DIR"
+
+# ---- Hicache config ----------------------------------------------------------
 # Reject anything other than none: this launcher has no SGLang CPU-offload
 # wiring (different surface than vLLM's SimpleCPUOffloadConnector).
-CACHE_ARGS=()
-WARMUP_ARGS=()
-CUDA_GRAPH_MAX_BS="$CONC"
-[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64
+
 case "$OFFLOADING" in
     none)
         # Leave SGLang's default RadixAttention prefix cache on — agentic
@@ -96,63 +98,35 @@ case "$OFFLOADING" in
         ;;
 esac
 
-# Transformers in the container doesn't recognize the `deepseek_v4` model_type.
-# PR #23608's fallback in hf_transformers_utils.get_config tries to handle this
-# by writing a patched config to /tmp, but in practice isn't catching the error
-# in this image. Patch the cached config.json directly instead: set model_type
-# to `deepseek_v3` so AutoConfig.from_pretrained succeeds, and keep
-# architectures=['DeepseekV4ForCausalLM'] so SGLang dispatches to its native
-# DSv4 model class (python/sglang/srt/models/deepseek_v4.py).
-python3 << PYEOF
-import json
-from huggingface_hub import hf_hub_download
-path = hf_hub_download(repo_id="$MODEL", filename="config.json")
-with open(path) as f:
-    config = json.load(f)
-if config.get("model_type") == "deepseek_v4":
-    config["model_type"] = "deepseek_v3"
-    with open(path, "w") as f:
-        json.dump(config, f, indent=2)
-    print(f"Patched {path}: model_type deepseek_v4 -> deepseek_v3")
-else:
-    print(f"No patch needed: model_type is {config.get('model_type')!r}")
-PYEOF
-
-# DSv4 FP4-experts path. Mirrors the env block in the fixed-seq-len sibling
-# (benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh), which tracks the active
-# block in python/run_dsv4.sh on the amd/deepseek_v4 branch:
-#   SGLANG_DSV4_FP4_EXPERTS=True   -> route experts through FP4 kernels
-#   SGLANG_FORCE_TRITON_MOE_FP8=0  -> dispatch MoE through aiter and apply
-#                                    the swiglu_limit clamp in the triton
-#                                    MoE fallback path.
-export SGLANG_REASONING_EFFORT=max
-export SGLANG_OPT_USE_FUSED_COMPRESS=true
-export SGLANG_OPT_USE_OLD_COMPRESSOR=true
-export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false
-export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false
-export SGLANG_OPT_USE_FUSED_HASH_TOPK=false
+# ---- LLM server config ----------------------------------------------------------
+
+CACHE_ARGS=()
+WARMUP_ARGS=()
+CUDA_GRAPH_MAX_BS="$CONC"
+[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64
+
+export SGLANG_DEFAULT_THINKING=1
+export SGLANG_DSV4_REASONING_EFFORT=max
 export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false
+export SGLANG_USE_AITER=1
+export SGLANG_USE_ROCM700A=0
+export SGLANG_OPT_USE_FUSED_COMPRESS=true
+export SGLANG_HACK_FLASHMLA_BACKEND=unified_kv_triton
+export SGLANG_OPT_FP8_WO_A_GEMM=false
+export SGLANG_OPT_USE_JIT_INDEXER_METADATA=false
+export SGLANG_OPT_USE_TOPK_V2=false
+export SGLANG_OPT_USE_AITER_INDEXER=true
+export SGLANG_OPT_USE_TILELANG_INDEXER=false
 export SGLANG_OPT_USE_TILELANG_MHC_PRE=false
 export SGLANG_OPT_USE_TILELANG_MHC_POST=false
-export SGLANG_OPT_USE_AITER_MHC_PRE=true
-export SGLANG_OPT_USE_AITER_MHC_POST=true
-export SGLANG_ENABLE_THINKING=1
-export SGLANG_USE_AITER=1
-export SGLANG_USE_ROCM700A=1
-export SGLANG_TOPK_TRANSFORM_512_TORCH=0
 export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1
-export SGLANG_DSV4_FP4_EXPERTS=True
-export SGLANG_OPT_DPSK_V4_RADIX=0
-export SGLANG_OPT_USE_OVERLAP_STORE_CACHE=false
-export SGLANG_OPT_USE_FUSED_STORE_CACHE=false
-export SGLANG_FORCE_TRITON_MOE_FP8=0
-export SGLANG_HACK_FLASHMLA_BACKEND=tilelang
-export SGLANG_OPT_USE_TILELANG_INDEXER=true
-export SGLANG_OPT_USE_TRITON_SWA_PREPARE=true
+export SGLANG_OPT_USE_FUSED_COMPRESS_TRITON=true
+export AITER_BF16_FP8_MOE_BOUND=0
+export SGLANG_EAGER_INPUT_NO_COPY=true
 
-# ---- Server config ----------------------------------------------------------
-SERVER_LOG="$RESULT_DIR/server.log"
-mkdir -p "$RESULT_DIR"
+# multi-stream
+export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=false
+export SGLANG_ROCM_USE_MULTI_STREAM=false
 
 # Parallelism: pure TP, TP+EP, or DEP (DP-attn + EP). Matches the dsv4 b200
 # vllm agentic launcher so the agentic sweep can probe both interactivity and
@@ -181,17 +155,19 @@ else
 fi
 
 echo "Starting sglang server..."
-python3 -m sglang.launch_server \
-    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
+sglang serve \
+    --model-path $MODEL \
     --host=0.0.0.0 \
-    --port "$PORT" \
+    --port $PORT \
     "${PARALLEL_ARGS[@]}" \
     --trust-remote-code \
-    --attention-backend compressed \
-    --max-running-requests "$PER_ENGINE_MAX_RUNNING" \
-    --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" \
+    --disable-radix-cache \
+    --attention-backend dsv4 \
+    --max-running-requests ${CONC} \
+    --mem-fraction-static 0.90 \
+    --swa-full-tokens-ratio 0.15 \
     --page-size 256 \
-    --context-length "$MAX_MODEL_LEN" \
+    --context-length $MAX_MODEL_LEN \
     --chunked-prefill-size 8192 \
     --disable-shared-experts-fusion \
     --tool-call-parser deepseekv4 \
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh
index c6f6cba25..6797f1023 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh
@@ -1,109 +1,28 @@
 #!/usr/bin/env bash
-set -euo pipefail
-set -x
-
-# Agentic trace replay benchmark for DeepSeek-V4-Pro FP4 on MI355X using SGLang.
-# Adapted from benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh (fixed-seq-len
-# sibling) with the agentic harness (build_replay_cmd / write_agentic_result_json
-# / analyze_benchmark_distributions) swapped in for run_benchmark_serving.
-#
-# This launcher does NOT support CPU offload. SGLang's KV offload paths are
-# different from vLLM's SimpleCPUOffloadConnector, and the matching agentic
-# config (dsv4-fp4-mi355x-sglang-agentic) only sweeps offloading=none.
-#
-# Required env vars:
-#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION
-
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=1000000
-fi
-
-if [[ -n "${SLURM_JOB_ID:-}" ]]; then
-    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+check_env_vars \
+    MODEL \
+    TP \
+    DP_ATTENTION \
+    EP_SIZE \
+    CONC \
+    ISL \
+    OSL \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME \
+    MAX_MODEL_LEN
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
 
-# ROCR/HIP visibility under slurm cgroups.
-if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
-    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
-fi
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
-# `hf download` creates the target dir if missing and is itself idempotent.
-# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
-# Either way, MODEL_PATH is what the server is launched with.
-if [[ -n "${MODEL_PATH:-}" ]]; then
-    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
-        hf download "$MODEL" --local-dir "$MODEL_PATH"
-    fi
-else
-    hf download "$MODEL"
-    export MODEL_PATH="$MODEL"
-fi
-rocm-smi || true
-amd-smi || true
-
-# ---- Resolve traces and install deps ----------------------------------------
-# https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-060826
-# export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226
-
-# ---- Resolve traces and install deps ----------------------------------------
-resolve_trace_source
-install_agentic_deps
-
-# ---- Server config ----------------------------------------------------------
-SERVER_LOG="$RESULT_DIR/server.log"
-mkdir -p "$RESULT_DIR"
-
-# ---- Hicache config ----------------------------------------------------------
-# Reject anything other than none: this launcher has no SGLang CPU-offload
-# wiring (different surface than vLLM's SimpleCPUOffloadConnector).
-
-case "$OFFLOADING" in
-    none)
-        # Leave SGLang's default RadixAttention prefix cache on — agentic
-        # replay needs it; --disable-radix-cache would zero the hit rate.
-        ;;
-    hicache)
-        # DeepSeek V4 HiCache uses ratio-based capacity control, not GB-based.
-        # DSv4 allocates several physical host sub-pools for each logical host
-        # token. MI355X nodes have ~3 TB of host DRAM (similar to B200's 3.8
-        # TiB), so ratio=8 at TP≥8 provides a large useful CPU tier within the
-        # node budget. Lower TP configs use higher ratios to maintain adequate
-        # host token capacity without exceeding DRAM limits.
-        if [ "$TP" -ge 8 ]; then
-            DEFAULT_HICACHE_RATIO=8
-        else
-            DEFAULT_HICACHE_RATIO=16
-        fi
-        HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}"
-        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}"
-        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
-        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first_direct}"
-        export SGLANG_ENABLE_UNIFIED_RADIX_TREE=1
-        CACHE_ARGS=(
-            --enable-hierarchical-cache
-            --hicache-ratio "$HICACHE_RATIO"
-            --hicache-write-policy "$HICACHE_WRITE_POLICY"
-            --hicache-io-backend "$HICACHE_IO_BACKEND"
-            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
-        )
-        echo "HiCache DSv4 CPU tier: ratio=$HICACHE_RATIO, write_policy=$HICACHE_WRITE_POLICY, io_backend=$HICACHE_IO_BACKEND, mem_layout=$HICACHE_MEM_LAYOUT"
-        ;;
-    *)
-        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
-        exit 1
-        ;;
-esac
-
-# ---- LLM server config ----------------------------------------------------------
-
-CACHE_ARGS=()
-WARMUP_ARGS=()
-CUDA_GRAPH_MAX_BS="$CONC"
-[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64
+# sglang ships in the image at the SHA encoded in the image tag (built
+# from the amd/deepseek_v4 branch in sgl-project/sglang). To bump sglang,
+# bump the image tag in .github/configs/amd-master.yaml.
 
 export SGLANG_DEFAULT_THINKING=1
 export SGLANG_DSV4_REASONING_EFFORT=max
@@ -128,33 +47,31 @@ export SGLANG_EAGER_INPUT_NO_COPY=true
 export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=false
 export SGLANG_ROCM_USE_MULTI_STREAM=false
 
-# Parallelism: pure TP, TP+EP, or DEP (DP-attn + EP). Matches the dsv4 b200
-# vllm agentic launcher so the agentic sweep can probe both interactivity and
-# throughput regimes.
-PARALLEL_ARGS=(--tensor-parallel-size "$TP")
-if [ "$DP_ATTENTION" = "true" ]; then
+SERVER_LOG=/workspace/server.log
+
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
+# Start GPU monitoring (power, temperature, clocks every second)
+start_gpu_monitor
+
+PARALLEL_ARGS=(
+    --tensor-parallel-size "$TP"
+)
+if [ "${DP_ATTENTION}" = "true" ]; then
     PARALLEL_ARGS+=(
         --dp "$TP"
         --enable-dp-attention
         --enable-prefill-delayer
+	--prefill-delayer-max-delay-ms 5000
     )
 fi
 if [ "${EP_SIZE:-1}" -gt 1 ]; then
     PARALLEL_ARGS+=(--ep-size "$EP_SIZE")
 fi
 
-# --max-running-requests is per-engine. With DP-attn each DP engine handles
-# only CONC/$TP sequences in steady state (the agentic harness load-balances
-# users across DP ranks), so size the per-engine cap to that.
-# Pure TP is a single engine and sees all CONC sequences itself.
-if [ "$DP_ATTENTION" = "true" ]; then
-    PER_ENGINE_MAX_RUNNING=$(( CONC / TP ))
-    [ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1
-else
-    PER_ENGINE_MAX_RUNNING=$CONC
-fi
-
-echo "Starting sglang server..."
 sglang serve \
     --model-path $MODEL \
     --host=0.0.0.0 \
@@ -173,16 +90,31 @@ sglang serve \
     --tool-call-parser deepseekv4 \
     --reasoning-parser deepseek-v4 \
     --chat-template "$(dirname "$0")/../chat_templates/deepseek_v4_thinking.jinja" \
-    --watchdog-timeout 1800 \
-    --enable-metrics \
-    "${CACHE_ARGS[@]}" \
-    "${WARMUP_ARGS[@]}" > "$SERVER_LOG" 2>&1 &
+    --watchdog-timeout 1800 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+
 SERVER_PID=$!
-echo "Server PID: $SERVER_PID"
 
+# Wait for server to be ready
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/
+
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
 
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+# Stop GPU monitoring
+stop_gpu_monitor
+set +x
\ No newline at end of file

From 4ebc4e2fe2f74f6e39d4fcd50de0fcffd7074d22 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Mon, 15 Jun 2026 11:55:10 +0900
Subject: [PATCH 10/21] [AMD] stop resetting CACHE_ARGS after the OFFLOADING
 case in dsv4_fp4_mi355x_sglang agentic script

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
index c6f6cba25..73ebac6f7 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
@@ -100,7 +100,6 @@ esac
 
 # ---- LLM server config ----------------------------------------------------------
 
-CACHE_ARGS=()
 WARMUP_ARGS=()
 CUDA_GRAPH_MAX_BS="$CONC"
 [ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64

From 735e9a3a8df700f1a06a7b66c2148794f7d7aac6 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Mon, 15 Jun 2026 11:56:58 +0900
Subject: [PATCH 11/21] [AMD] tune hicache ratio and disable none-offloading in
 agentic config

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml                          | 2 +-
 benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index d2a90a6b1..88a6046f7 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2753,5 +2753,5 @@ dsv4-fp4-mi355x-sglang-agentic-hicache:
         #DPA, conc>=64
       #- { tp: 8, dp-attn: false, offloading: none, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256]  }
       #- { tp: 8, dp-attn: false, offloading: hicache, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256]  }
-      - { tp: 8, dp-attn: false, offloading: none, conc-list: [64]  }
+      #- { tp: 8, dp-attn: false, offloading: none, conc-list: [64]  }
       - { tp: 8, dp-attn: false, offloading: hicache, conc-list: [64]  }
\ No newline at end of file
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
index 73ebac6f7..2003c2761 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
@@ -74,7 +74,7 @@ case "$OFFLOADING" in
         # node budget. Lower TP configs use higher ratios to maintain adequate
         # host token capacity without exceeding DRAM limits.
         if [ "$TP" -ge 8 ]; then
-            DEFAULT_HICACHE_RATIO=8
+            DEFAULT_HICACHE_RATIO=2
         else
             DEFAULT_HICACHE_RATIO=16
         fi

From d3caa2b94ecd72640d86e49a798de572ea4249fa Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Mon, 15 Jun 2026 13:00:39 +0900
Subject: [PATCH 12/21] [AMD] remove --disable-radix-cache from
 dsv4_fp4_mi355x_sglang agentic script

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
index 2003c2761..5c780b646 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
@@ -160,7 +160,6 @@ sglang serve \
     --port $PORT \
     "${PARALLEL_ARGS[@]}" \
     --trust-remote-code \
-    --disable-radix-cache \
     --attention-backend dsv4 \
     --max-running-requests ${CONC} \
     --mem-fraction-static 0.90 \

From c11f63776cc849220aefd380fca0b2e2c783e3f2 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 12 Jun 2026 03:56:25 +0000
Subject: [PATCH 13/21] [AMD] add dsv4 sglang disagg

---
 .github/configs/amd-master.yaml               | 623 +++++++++---------
 benchmarks/multi_node/amd_utils/bench.sh      |   7 +-
 benchmarks/multi_node/amd_utils/env.sh        |  55 ++
 benchmarks/multi_node/amd_utils/models.yaml   |  35 +
 .../multi_node/amd_utils/server_sglang.sh     |  20 +-
 benchmarks/multi_node/amd_utils/submit.sh     |  10 +
 .../dsv4_fp4_mi355x_sglang-disagg.sh          |  83 +++
 7 files changed, 525 insertions(+), 308 deletions(-)
 create mode 100755 benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 88a6046f7..80c14f58b 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -16,6 +16,7 @@ dsr1-fp4-mi355x-sglang:
     - isl: 8192
       osl: 1024
       search-space:
+      - { tp: 4, conc-start: 4, conc-end: 64 }
       - { tp: 8, conc-start: 4, conc-end: 64 }
     # Agentic-coding sweep commented out for this image-bump PR — the
     # 10-conc agentic matrix amplifies sweep cost and the bump validation
@@ -261,7 +262,7 @@ qwen3.5-fp8-mi325x-sglang:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
 qwen3.5-fp8-mi355x-sglang:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: mi355x
@@ -273,17 +274,14 @@ qwen3.5-fp8-mi355x-sglang:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
-      - { tp: 8, ep: 8, conc-start: 64, conc-end: 256 }
-      - { tp: 2, ep: 2, conc-start: 128, conc-end: 256 }
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 2, ep: 2, conc-start: 4, conc-end: 32 }
-      - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 }
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
 
 qwen3.5-fp8-mi355x-sglang-mtp:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: mi355x
@@ -295,14 +293,30 @@ qwen3.5-fp8-mi355x-sglang-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 64, conc-end: 256, spec-decoding: mtp }
-      - { tp: 2, ep: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp }
-      - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp }
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+
+# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). Metadata is
+# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this
+# PR adds an agentic-coding scenarios block that differs from main
+# (either main had none or had a different conc/offload sweep).
+# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main.
+qwen3.5-fp8-mi355x-sglang-agentic:
+  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
 
 qwen3.5-fp8-mi355x-atom:
   image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
@@ -413,7 +427,7 @@ qwen3.5-fp8-mi355x-sglang-disagg:
           - "DECODE_MTP_SIZE=0"
 
 qwen3.5-fp4-mi355x-sglang:
-  image: lmsysorg/sglang:v0.5.12-rocm720-mi35x
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260604
   model: amd/Qwen3.5-397B-A17B-MXFP4
   model-prefix: qwen3.5
   runner: mi355x
@@ -433,22 +447,6 @@ qwen3.5-fp4-mi355x-sglang:
       - { tp: 2, conc-start: 4, conc-end: 256 }
       - { tp: 4, conc-start: 4, conc-end: 16 }
 
-# target
-qwen3.5-fp4-mi355x-sglang-agentic-hicache:
-  image: lmsysorg/sglang:v0.5.12-rocm720-mi35x
-  model: amd/Qwen3.5-397B-A17B-MXFP4
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp4
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 2, ep: 1, offloading: none, conc-list: [8, 16, 32, 40, 48, 56, 72] }
-      - { tp: 2, ep: 1, offloading: hicache, conc-list: [8, 16, 32, 40, 48, 56, 72] }
-
 qwen3.5-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
   model: amd/Qwen3.5-397B-A17B-MXFP4
@@ -471,7 +469,7 @@ qwen3.5-fp4-mi355x-atom:
       - { tp: 4, conc-start: 4, conc-end: 16 }
 
 qwen3.5-fp4-mi355x-sglang-mtp:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260604
   model: amd/Qwen3.5-397B-A17B-MXFP4
   model-prefix: qwen3.5
   runner: mi355x
@@ -483,12 +481,12 @@ qwen3.5-fp4-mi355x-sglang-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
       - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
       - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
 
 qwen3.5-fp4-mi355x-sglang-disagg:
@@ -701,6 +699,26 @@ glm5.1-fp4-mi355x-sglang:
       - { tp: 2, conc-start: 4, conc-end: 256 }
       - { tp: 4, conc-start: 4, conc-end: 16 }
 
+# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). Metadata is
+# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this
+# PR adds an agentic-coding scenarios block that differs from main
+# (either main had none or had a different conc/offload sweep).
+# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main.
+glm5.1-fp4-mi355x-sglang-agentic:
+  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415
+  model: amd/GLM-5.1-MXFP4
+  model-prefix: glm5.1
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+
 glm5.1-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
   model: amd/GLM-5.1-MXFP4
@@ -721,7 +739,7 @@ glm5.1-fp4-mi355x-atom:
       - { tp: 4, conc-start: 4, conc-end: 256 }
 
 kimik2.5-int4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:v0.22.0
+  image: vllm/vllm-openai-rocm:v0.21.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: mi355x
@@ -740,7 +758,7 @@ kimik2.5-int4-mi355x-vllm:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
 kimik2.5-int4-mi325x-vllm:
-  image: vllm/vllm-openai-rocm:v0.22.0
+  image: vllm/vllm-openai-rocm:v0.21.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: mi325x
@@ -759,7 +777,7 @@ kimik2.5-int4-mi325x-vllm:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
 kimik2.5-int4-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:v0.22.0
+  image: vllm/vllm-openai-rocm:v0.21.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: mi300x
@@ -798,6 +816,38 @@ kimik2.5-fp4-mi355x-vllm:
       - { tp: 8, conc-start: 4, conc-end: 64 }
       - { tp: 4, conc-start: 4, conc-end: 64 }
 
+# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below;
+# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0'
+kimik2.5-fp4-mi355x-vllm-agentic:
+  # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin
+  # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm
+  # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and
+  # includes all subsequent ROCm offload work.
+  image: vllm/vllm-openai-rocm:v0.21.0
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
+      # CPU offload only above the KV cliff. Lower concurrencies fit
+      # entirely on-GPU, so paying the offload-path overhead there would
+      # just slow them down without measuring anything new.
+      - { tp: 8, offloading: cpu,  conc-list: [32, 40, 48, 56] }
+      # TP=4 probe: half-node layout doubles per-GPU weight footprint
+      # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to
+      # cliff-region concurrencies on both offload modes so we can directly
+      # compare TP=4 vs TP=8 at the same conc points.
+      - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
+      - { tp: 4, offloading: cpu,  conc-list: [16, 24, 32, 40] }
+
 kimik2.5-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
   model: amd/Kimi-K2.5-MXFP4
@@ -842,6 +892,33 @@ minimaxm2.5-fp8-mi355x-vllm:
       - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 }
       - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 }
 
+# Diverged from minimaxm2.5-fp8-mi355x-vllm (agentic-coding sibling). Reasons below;
+# the original minimaxm2.5-fp8-mi355x-vllm entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - image: 'vllm/vllm-openai-rocm:v0.19.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
+minimaxm2.5-fp8-mi355x-vllm-agentic:
+  # Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector]
+  # add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"),
+  # which enables SimpleCPUOffloadConnector on ROCm. Required for the
+  # cpu-offload sweep points to use the same offload path as the NVIDIA
+  # agentic-coding configs.
+  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi355x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical).
+    # Compute saturates first; cpu offload likely won't help, but worth confirming.
+    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
+    - duration: 1800
+      search-space:
+      - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
+      - { tp: 4, ep: 4, offloading: cpu,  conc-list: [48, 56, 64, 72, 96] }
+
 minimaxm2.5-fp8-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
   model: MiniMaxAI/MiniMax-M2.5
@@ -888,22 +965,6 @@ minimaxm2.5-fp4-mi355x-atom:
       - { tp: 4, conc-start: 4, conc-end: 128 }
       - { tp: 8, conc-start: 4, conc-end: 16 }
 
-# target
-minimaxm2.5-fp4-mi355x-vllm-agentic-lmcache:
-  image: vllm/vllm-openai-rocm:v0.22.0
-  model: amd/MiniMax-M2.5-MXFP4
-  model-prefix: minimaxm2.5
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 1, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
-      - { tp: 1, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
-
 minimaxm2.5-fp4-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: amd/MiniMax-M2.5-MXFP4
@@ -928,7 +989,7 @@ minimaxm2.5-fp4-mi355x-vllm:
       - { tp: 4, conc-start: 4, conc-end: 64 }
 
 minimaxm2.5-fp8-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:v0.22.0
+  image: vllm/vllm-openai-rocm:v0.21.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi300x
@@ -948,6 +1009,29 @@ minimaxm2.5-fp8-mi300x-vllm:
       - { tp: 2, conc-start: 4, conc-end: 64 }
       - { tp: 4, conc-start: 4, conc-end: 64 }
 
+# Diverged from minimaxm2.5-fp8-mi300x-vllm (agentic-coding sibling). Reasons below;
+# the original minimaxm2.5-fp8-mi300x-vllm entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - image: 'vllm/vllm-openai-rocm:v0.16.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
+minimaxm2.5-fp8-mi300x-vllm-agentic:
+  # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
+  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi300x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200);
+    # KV cliff ~52. Compute saturates first.
+    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
+      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
+
 minimaxm2.5-fp8-mi325x-vllm:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
@@ -969,8 +1053,32 @@ minimaxm2.5-fp8-mi325x-vllm:
       - { tp: 2, conc-start: 4, conc-end: 64 }
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
 
+# Diverged from minimaxm2.5-fp8-mi325x-vllm (agentic-coding sibling). Reasons below;
+# the original minimaxm2.5-fp8-mi325x-vllm entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
+minimaxm2.5-fp8-mi325x-vllm-agentic:
+  # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
+  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi325x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # MI325X tp=4: cloned from MI300X recipe (slightly faster compute,
+    # similar HBM profile). Compute saturates first; cpu-offload window
+    # exercises the SimpleCPUOffloadConnector path enabled by the rocm
+    # nightly. Mirror MI300X conc grid for cross-GPU comparability.
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
+      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
+
 gptoss-fp4-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:v0.22.0
+  image: vllm/vllm-openai-rocm:v0.17.0
   model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: mi300x
@@ -1021,7 +1129,7 @@ gptoss-fp4-mi325x-vllm:
 
 gptoss-fp4-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.22.0
-  model: amd/gpt-oss-120b-w-mxfp4-a-fp8
+  model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: mi355x
   precision: fp4
@@ -1084,7 +1192,7 @@ dsr1-fp8-mi355x-atom:
       - { tp: 8, conc-start: 4, conc-end: 128 }
 
 dsr1-fp8-mi355x-atom-mtp:
-  image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
+  image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: mi355x
@@ -1096,7 +1204,7 @@ dsr1-fp8-mi355x-atom-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
@@ -1411,7 +1519,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
           - "DECODE_MTP_SIZE=2"
 
 kimik2.5-fp4-mi355x-vllm-disagg:
-  image: vllm/vllm-openai-rocm:v0.22.0
+  image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036
   model: amd/Kimi-K2.5-MXFP4
   model-prefix: kimik2.5
   runner: mi355x-disagg
@@ -1465,7 +1573,7 @@ kimik2.5-fp4-mi355x-vllm-disagg:
           - "DECODE_NODES=2"
 
 minimaxm2.5-fp8-mi355x-vllm-disagg:
-  image: vllm/vllm-openai-rocm:v0.22.0
+  image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi355x-disagg
@@ -1858,6 +1966,7 @@ dsr1-fp4-mi355x-sglang-disagg-1k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=1"
   
+
 dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
   image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
   model: amd/DeepSeek-R1-0528-MXFP4-v2
@@ -1968,6 +2077,7 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=3"
 
+
       # 1*DEP8 + 1*DEP8
       - spec-decoding: "mtp"
         conc-list: [ 128 ]
@@ -2025,8 +2135,72 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=1"
 
+
+# DSv4 PD-disaggregation on MI355X via SGLang + MoRI. Structure mirrors
+# dsr1-fp4-mi355x-sglang-disagg but only the isl 8192 / osl 1024 scenario, with two
+# topology families captured from the validated manual recipe (see
+# dsv4_mi355x_sglang_disagg_plan.md):
+#   - pure-TP 1P1D (TP8, mori KV transfer)
+#   - DEP 1P1D     (TP8/EP8/DP8, mori KV transfer + mori MoE a2a, dp-attention)
+# DSv4-specific serving knobs (attention-backend dsv4, page-size 256, unified_kv_triton,
+# AITER indexer, deepseekv4 parsers) live in amd_utils/{models.yaml,env.sh}; the bench
+# client uses --dsv4 framing (amd_utils/bench.sh). STP only for now (reference recipe has
+# no spec decoding); MTP is a follow-up.
+dsv4-fp4-mi355x-sglang-disagg:
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # non-MTP configurations
+      # 1P1D pure TP8  (mori KV transfer)
+      - spec-decoding: "none"
+        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+      # 2P1D DEP8 as configured below (mori KV transfer + mori MoE a2a, dp-attention); NOTE(review): header comment says 1P1D - confirm
+      - spec-decoding: "none"
+        conc-list: [ 512, 768, 1024 ]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
 dsv4-fp4-mi355x-sglang:
-  image: rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
@@ -2081,6 +2255,25 @@ dsv4-fp4-mi355x-sglang-mtp:
       - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048, spec-decoding: mtp }
       - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32, spec-decoding: mtp }
 
+# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
+# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged
+# on 2026-05-05, so any nightly built after that includes the
+# DeepseekV4ForCausalLM model class.
+#
+# IMPORTANT: pin to a digest-suffixed nightly tag rather than the
+# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs
+# files keyed on the image string and short-circuits re-import if the
+# file already exists, so the floating tag silently keeps a stale build
+# even after Docker Hub updates `:nightly`.
+#
+# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the
+# rest); InferenceX classifies this as fp4 — same as the sister sglang
+# and atom DSv4 mi355x entries below. Image and serving flags follow the
+# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp
+# executor, triton_unfused MoE (required for the FP4 expert format),
+# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192,
+# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
+# probe to validate the ROCm DP+EP path.
 dsv4-fp4-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -2174,23 +2367,6 @@ dsv4-fp4-mi355x-atom-mtp:
       search-space:
       - { tp: 8, ep: 1, conc-start: 1, conc-end: 1024, spec-decoding: mtp }
 
-# target
-dsv4-fp4-mi355x-atom-agentic-lmcache:
-  image: rocm/atom-dev:nightly_202606101557
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: mi355x
-  precision: fp4
-  framework: atom
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list:    [52] }
-      #- { tp: 8, ep: 1, offloading: none, conc-list:    [44, 48, 52, 56, 60] }
-      #- { tp: 8, ep: 1, offloading: lmcache, conc-list:    [44, 48, 52, 56, 60] }
-
 qwen3.5-bf16-mi325x-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
   model: Qwen/Qwen3.5-397B-A17B
@@ -2286,6 +2462,44 @@ glm5-fp8-mi325x-sglang-mtp:
       search-space:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
+# ============================================================================
+# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries).
+# Recipes that ALREADY existed on main were intentionally left at main's version
+# to preserve main behavior; PR-branch modifications to those recipes are NOT
+# brought in here.
+# ============================================================================
+
+qwen3.5-fp8-mi355x-sglang-agentic-hicache:
+  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+      - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
+
+dsv4-fp4-mi355x-vllm-agentic:
+  image: vllm/vllm-openai-rocm:v0.21.0
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4] }
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] }
+      - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] }
+
 dsr1-fp4-mi355x-sglang-disagg-mtp:
   image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
   model: amd/DeepSeek-R1-0528-MXFP4-v2
@@ -2514,214 +2728,20 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=1"
       
-qwen3.5-fp8-mi355x-sglang-agentic:
-  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
-glm5.1-fp4-mi355x-sglang-agentic:
-  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415
-  model: amd/GLM-5.1-MXFP4
-  model-prefix: glm5.1
-  runner: mi355x
-  precision: fp4
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
-# target
-glm5.1-fp4-mi355x-sglang-agentic-hicache:
-  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
-  model: amd/GLM-5.1-MXFP4
-  model-prefix: glm5.1
-  runner: mi355x
-  precision: fp4
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
-      - { tp: 2, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48] }
-      - { tp: 2, ep: 1, offloading: hicache, conc-list: [4, 8, 16, 32, 40, 48] }
-
-kimik2.5-fp4-mi355x-vllm-agentic:
-  image: vllm/vllm-openai-rocm:v0.22.0
-  model: amd/Kimi-K2.5-MXFP4
-  model-prefix: kimik2.5
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
-      # CPU offload only above the KV cliff. Lower concurrencies fit
-      # entirely on-GPU, so paying the offload-path overhead there would
-      # just slow them down without measuring anything new.
-      - { tp: 8, offloading: cpu,  conc-list: [32, 40, 48, 56] }
-      # TP=4 probe: half-node layout doubles per-GPU weight footprint
-      # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to
-      # cliff-region concurrencies on both offload modes so we can directly
-      # compare TP=4 vs TP=8 at the same conc points.
-      - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
-      - { tp: 4, offloading: cpu,  conc-list: [16, 24, 32, 40] }
-
-# target
-kimik2.5-fp4-mi355x-vllm-agentic-lmcache:
-  image: vllm/vllm-openai-rocm:v0.22.0
-  model: amd/Kimi-K2.5-MXFP4
-  model-prefix: kimik2.5
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 4, ep: 1, offloading: none, conc-list:    [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
-      - { tp: 4, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
-
-# target
-kimik2.5-fp4-mi355x-vllm-agentic-lmcache-060226DRAM1500GB:
-  image: vllm/vllm-openai-rocm:v0.22.0
-  model: amd/Kimi-K2.5-MXFP4
-  model-prefix: kimik2.5
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      #- { tp: 4, ep: 1, offloading: none, conc-list:    [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
-      - { tp: 4, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
-
-minimaxm2.5-fp8-mi355x-vllm-agentic:
-  image: vllm/vllm-openai-rocm:v0.22.0
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: mi355x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical).
-    # Compute saturates first; cpu offload likely won't help, but worth confirming.
-    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
-    - duration: 1800
-      search-space:
-      - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
-      - { tp: 4, ep: 4, offloading: cpu,  conc-list: [48, 56, 64, 72, 96] }
-
-# target
-minimaxm2.5-fp8-mi355x-vllm-agentic-lmcache:
-  image: vllm/vllm-openai-rocm:v0.22.0
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: mi355x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical).
-    # Compute saturates first; cpu offload likely won't help, but worth confirming.
-    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
-    - duration: 1800
-      search-space:
-      - { tp: 2, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
-      - { tp: 2, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
-
-minimaxm2.5-fp8-mi300x-vllm-agentic:
-  image: vllm/vllm-openai-rocm:v0.22.0
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: mi300x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200);
-    # KV cliff ~52. Compute saturates first.
-    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
-      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
-
-minimaxm2.5-fp8-mi325x-vllm-agentic:
-  image: vllm/vllm-openai-rocm:v0.22.0
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: mi325x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # MI325X tp=4: cloned from MI300X recipe (slightly faster compute,
-    # similar HBM profile). Compute saturates first; cpu-offload window
-    # exercises the SimpleCPUOffloadConnector path enabled by the rocm
-    # nightly. Mirror MI300X conc grid for cross-vendor comparability.
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
-      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
-
-# target
-qwen3.5-fp8-mi355x-sglang-agentic-hicache:
-  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260531
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 4, ep: 1, offloading: none, conc-list: [8, 16, 32, 40, 48, 56, 72] }
-      - { tp: 4, ep: 1, offloading: hicache, conc-list: [8, 16, 32, 40, 48, 56, 72] }
-
-dsv4-fp4-mi355x-vllm-agentic:
-  image: vllm/vllm-openai-rocm:v0.22.0
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4] }
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] }
-      - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] }
 
+# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the
+# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the
+# image tag, so bumping sglang is just an image tag bump here. Sweeps
+# DP-attention on/off and EP=8.
+
+# Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). Reasons below;
+# the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - scenarios: replaced fixed-seq-len with agentic-coding.
+# Image differs from the base entry: this sibling stays pinned to a rocm/sgl-dev DSv4 build.
+# CONC ranges mirror dsv4-fp4-b200-vllm-agentic for cross-hardware
+# comparability. Offload sweep is none-only (SGLang has no equivalent of
+# vLLM's SimpleCPUOffloadConnector path that we exercise on b200).
 dsv4-fp4-mi355x-sglang-agentic:
   image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -2737,21 +2757,22 @@ dsv4-fp4-mi355x-sglang-agentic:
       - { tp: 8, offloading: none, conc-list: [16, 32, 64] }
       - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] }
 
-# target
-dsv4-fp4-mi355x-sglang-agentic-hicache:
-  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: mi355x
-  precision: fp4
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-        #DPA, conc>=64
-      #- { tp: 8, dp-attn: false, offloading: none, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256]  }
-      #- { tp: 8, dp-attn: false, offloading: hicache, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256]  }
-      #- { tp: 8, dp-attn: false, offloading: none, conc-list: [64]  }
-      - { tp: 8, dp-attn: false, offloading: hicache, conc-list: [64]  }
\ No newline at end of file
+# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
+# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged
+# on 2026-05-05, so any nightly built after that includes the
+# DeepseekV4ForCausalLM model class.
+#
+# IMPORTANT: pin to a digest-suffixed nightly tag rather than the
+# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs
+# files keyed on the image string and short-circuits re-import if the
+# file already exists, so the floating tag silently keeps a stale build
+# even after Docker Hub updates `:nightly`.
+#
+# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the
+# rest); InferenceX classifies this as fp4 — same as the sister sglang
+# and atom DSv4 mi355x entries below. Image and serving flags follow the
+# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp
+# executor, triton_unfused MoE (required for the FP4 expert format),
+# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192,
+# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
+# probe to validate the ROCm DP+EP path.
diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh
index 05384f435..d198a4ddd 100755
--- a/benchmarks/multi_node/amd_utils/bench.sh
+++ b/benchmarks/multi_node/amd_utils/bench.sh
@@ -79,7 +79,12 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do
     if [[ "$ENGINE" == "vllm-disagg" ]]; then
         extra_flags="--trust-remote-code --tokenizer $MODEL_PATH"
     else
-        if [ "$IS_MTP" = "true" ]; then
+        # DeepSeek-V4-Pro ships no jinja chat_template, so --use-chat-template crashes;
+        # --dsv4 applies the DSv4 <bos><User>...<Assistant><think> framing instead
+        # (chat-formatted inputs are required for correct EAGLE/MTP acceptance too).
+        if [[ "$model_name" == "DeepSeek-V4-Pro" ]]; then
+            extra_flags="--dsv4"
+        elif [ "$IS_MTP" = "true" ]; then
             extra_flags="--use-chat-template"
         fi
     fi
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 71d2653bd..6b0e4206a 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -228,6 +228,61 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
         fi
     fi
 
+    # =========================================================================
+    # DeepSeek-V4-Pro PD recipe overrides
+    # Placed at the end of the SGLang env block so it wins over the global
+    # MoRI/SGLang defaults set above. Mirrors the validated DSv4 manual PD
+    # commands (see dsv4_mi355x_sglang_disagg_plan.md §2). Only the SGLang/MoRI
+    # env knobs are pinned here; CLI flags live in models.yaml and the cluster
+    # NIC/socket vars (NCCL_IB_HCA, *_SOCKET_IFNAME, IBDEVICES) stay runner-derived.
+    # =========================================================================
+    if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
+        # MoRI dispatch/combine dtypes: auto for both roles (not the fp8 split default)
+        export SGLANG_MORI_DISPATCH_DTYPE=auto
+        export MORI_COMBINE_DTYPE_PREFILL=auto
+        export MORI_COMBINE_DTYPE_DECODE=auto
+
+        # Per-role MoRI dispatch sizing (used by the harness chunked/MoE math)
+        export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192
+        export MORI_MAX_DISPATCH_TOKENS_DECODE=64
+        export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=2048
+        export MORI_MOE_MAX_INPUT_TOKENS_DECODE=332
+
+        # PER_RANK dispatch tokens are pinned independently of the sizing above
+        # (16384 prefill / 128 decode in the reference recipe). server_sglang.sh
+        # prefers these over the MORI_MAX_DISPATCH_TOKENS_* coupling when set.
+        export MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL=16384
+        export MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_DECODE=128
+
+        # Fixed inter-kernel switch threshold (not derived). NOTE: the DP+EP path in
+        # server_sglang.sh recomputes this dynamically for the DEP topology.
+        export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=4096
+
+        # Overlap plan stream on for DSv4 (global default is 0)
+        export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=0
+
+        # DSv4 model kernel routing (mirrors the single-node / manual PD recipe)
+        export SGLANG_DEFAULT_THINKING=1
+        export SGLANG_DSV4_REASONING_EFFORT=max
+        export SGLANG_USE_ROCM700A=0
+        export SGLANG_HACK_FLASHMLA_BACKEND=unified_kv_triton
+        export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false
+        export SGLANG_OPT_USE_FUSED_COMPRESS=true
+        export SGLANG_OPT_USE_FUSED_COMPRESS_TRITON=true
+        export SGLANG_OPT_FP8_WO_A_GEMM=false
+        export SGLANG_OPT_USE_JIT_INDEXER_METADATA=false
+        export SGLANG_OPT_USE_TOPK_V2=false
+        export SGLANG_OPT_USE_AITER_INDEXER=true
+        export SGLANG_OPT_USE_TILELANG_INDEXER=false
+        export SGLANG_OPT_USE_TILELANG_MHC_PRE=false
+        export SGLANG_OPT_USE_TILELANG_MHC_POST=false
+        export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1
+        export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=false
+        export SGLANG_ROCM_USE_MULTI_STREAM=false
+        export AITER_BF16_FP8_MOE_BOUND=0
+        export SGLANG_EAGER_INPUT_NO_COPY=true
+    fi
+
     # FIXME: WA for latest upstream 0305 image
     export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH}
 
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 605a377be..e68c448ce 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -349,3 +349,38 @@ DeepSeek-R1-0528-MXFP4-v2:
       max_running_requests: 128
       chunked_prefill_size: 262144
       cuda_graph_bs_range: "1-128"
+
+# DeepSeek-V4-Pro PD-disaggregation recipe (MI355X, SGLang + MoRI).
+# KV transfer = mori for both topologies (pure-TP and DEP); the DP path additionally
+# routes the MoE all-to-all through mori (--moe-a2a-backend mori) with dp-attention.
+# DSv4-specific kernel routing (unified_kv_triton, AITER indexer, fp8 wo_a fallback,
+# thinking/reasoning-effort, dispatch dtypes, per-role PER_RANK dispatch tokens) is set
+# in env.sh's DeepSeek-V4-Pro block. The bench client uses --dsv4 framing (bench.sh).
+# prefill.disable_cuda_graph routes prefill to --disable-cuda-graph; decode keeps
+# --cuda-graph-bs. See dsv4_mi355x_sglang_disagg_plan.md.
+DeepSeek-V4-Pro:
+  base_flags: "--decode-log-interval 100 --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend dsv4 --page-size 256 --swa-full-tokens-ratio 0.1 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --disaggregation-transfer-backend mori"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    disable_cuda_graph: true
+    dp:
+      max_running_requests: 1024
+      chunked_prefill_size: 131072
+      context_length: 9217
+      max_total_tokens: 262144
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 131072
+      context_length: 9217
+      max_total_tokens: 262144
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 1024
+      cuda_graph_bs_range: "1-128"
+    no_dp:
+      max_running_requests: 128
+      cuda_graph_bs_range: "1-128"
diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh
index c28ccab41..38fbdfc8e 100755
--- a/benchmarks/multi_node/amd_utils/server_sglang.sh
+++ b/benchmarks/multi_node/amd_utils/server_sglang.sh
@@ -125,6 +125,7 @@ decode = m.get('decode', {})
 
 print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"')
 print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"')
+print(f'PREFILL_DISABLE_CUDA_GRAPH=\"{prefill.get(\"disable_cuda_graph\", False)}\"')
 
 dp = prefill.get('dp', {})
 no_dp = prefill.get('no_dp', {})
@@ -136,6 +137,8 @@ print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"')
 print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"')
 print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"')
 print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"')
+print(f'PREFILL_CONTEXT_LENGTH_NO_DP=\"{no_dp.get(\"context_length\", \"\")}\"')
+print(f'PREFILL_MAX_TOTAL_TOKENS_NO_DP=\"{no_dp.get(\"max_total_tokens\", \"\")}\"')
 s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128)
 print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"')
 print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"')
@@ -183,8 +186,8 @@ else
     prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END))
     prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP
     prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP
-    prefill_context_length=""
-    prefill_max_total_tokens=""
+    prefill_context_length=$PREFILL_CONTEXT_LENGTH_NO_DP
+    prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_NO_DP
     prefill_enable_two_batch_overlap="false"
 fi
 
@@ -222,7 +225,12 @@ if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; t
 fi
 
 # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
-PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
+# disable_cuda_graph (model-level) routes prefill to --disable-cuda-graph instead of --cuda-graph-bs.
+if [[ "$PREFILL_DISABLE_CUDA_GRAPH" == "True" ]] || [[ "$PREFILL_DISABLE_CUDA_GRAPH" == "true" ]]; then
+    PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --disable-cuda-graph "
+else
+    PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
+fi
 if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
     PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache"
 fi
@@ -418,7 +426,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
         PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
     fi
     set +x
-    PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+    PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL:-${MORI_MAX_DISPATCH_TOKENS_PREFILL}} python3 -m sglang.launch_server \
         --model-path $MODEL_DIR/$MODEL_NAME \
         --disaggregation-mode prefill \
         --disaggregation-ib-device ${IBDEVICES} \
@@ -650,7 +658,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
         PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
     fi
     set +x
-    PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+    PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL:-${MORI_MAX_DISPATCH_TOKENS_PREFILL}} python3 -m sglang.launch_server \
         --model-path $MODEL_DIR/${MODEL_NAME} \
         --disaggregation-mode prefill \
         --disaggregation-ib-device ${IBDEVICES} \
@@ -718,7 +726,7 @@ else
         DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}"
     fi
     set +x
-    DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
+    DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_DECODE:-${MORI_MAX_DISPATCH_TOKENS_DECODE}} python3 -m sglang.launch_server \
         --model-path ${MODEL_DIR}/${MODEL_NAME} \
         --disaggregation-mode decode \
         --disaggregation-ib-device ${IBDEVICES} \
diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh
index fa3d65418..c264293a7 100755
--- a/benchmarks/multi_node/amd_utils/submit.sh
+++ b/benchmarks/multi_node/amd_utils/submit.sh
@@ -47,6 +47,10 @@ Required environment variables:
   MODEL_NAME       Model name directory
   CONTAINER_IMAGE  Docker image name (e.g., vllm_disagg_pd:latest)
   RUNNER_NAME      Runner identifier (for job name)
+
+Optional environment variables:
+  DRY_RUN          1 = echo composed server/router launch commands instead of
+                   running them (preview a recipe against a real allocation).
 USAGE
 }
 
@@ -125,6 +129,12 @@ export BENCH_MAX_CONCURRENCY=${CONCURRENCIES}
 export BENCH_REQUEST_RATE=${REQUEST_RATE}
 export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8}
 
+# DRY_RUN=1 makes server_sglang.sh echo the composed prefill/decode/router launch
+# commands instead of executing them (useful for previewing a recipe against a real
+# allocation). Threaded here → job.slurm → Docker (-e DRY_RUN) → server_sglang.sh.
+# sbatch defaults to --export=ALL, so exporting it is what carries it into the job.
+export DRY_RUN="${DRY_RUN:-0}"
+
 # Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker)
 export RUN_EVAL="${RUN_EVAL:-false}"
 export EVAL_ONLY="${EVAL_ONLY:-false}"
diff --git a/benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh
new file mode 100755
index 000000000..d17d1a323
--- /dev/null
+++ b/benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    SPEC_DECODING \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO \
+    FRAMEWORK
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+# Use upstreamed multi_node scripts (no external clone needed)
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+# Set up SGL launch script-specific environment variables
+export TIME_LIMIT="08:00:00"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+export PREFILL_ENABLE_EP=false
+else
+export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
+export PREFILL_ENABLE_DP=true
+else
+export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+export DECODE_ENABLE_EP=false
+else
+export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then
+export DECODE_ENABLE_DP=true
+else
+export DECODE_ENABLE_DP=false
+fi
+
+# Launch jobs based on ISL/OSL
+# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
+# by a list of numbers delimited by 'x'. This is because of how the underlying launch script
+# expects the concurrencies.
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
+    $PREFILL_NUM_WORKERS \
+    $DECODE_NODES \
+    $DECODE_NUM_WORKERS \
+    $ISL $OSL "${CONC_LIST// /x}" inf \
+    ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
+    ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP} ${DECODE_TP} \
+    ${RANDOM_RANGE_RATIO})
+
+if [[ $? -ne 0 ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"

From a9e1304067230bb9cbad4eff41ef8eeb58321e1a Mon Sep 17 00:00:00 2001
From: Theresa Shan <theresa.shan@amd.com>
Date: Mon, 15 Jun 2026 15:30:02 +0000
Subject: [PATCH 14/21] recipe for agentic dsr1 fp4 and dsv4 fp4

Signed-off-by: thshan@amd.com <thshan@amd.com@mia1-p01-g07.mia.tensorwave.lan>
Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/configs/amd-master.yaml               | 119 ++++++++++++
 .github/workflows/run-sweep.yml               |   1 +
 .../agentic/dsr1_fp4_mi355x_sglang-disagg.sh  | 176 ++++++++++++++++++
 .../agentic/dsv4_fp4_mi355x_sglang-disagg.sh  | 160 ++++++++++++++++
 benchmarks/multi_node/amd_utils/env.sh        |  32 +++-
 benchmarks/multi_node/amd_utils/job.slurm     |  73 +++++++-
 benchmarks/multi_node/amd_utils/models.yaml   |   1 +
 .../multi_node/amd_utils/server_sglang.sh     |  41 +++-
 .../multi_node/amd_utils/trace_replay.sh      |  93 +++++++++
 utils/matrix_logic/generate_sweep_configs.py  |   2 +
 utils/matrix_logic/validation.py              |   3 +
 11 files changed, 689 insertions(+), 12 deletions(-)
 create mode 100755 benchmarks/multi_node/agentic/dsr1_fp4_mi355x_sglang-disagg.sh
 create mode 100755 benchmarks/multi_node/agentic/dsv4_fp4_mi355x_sglang-disagg.sh
 create mode 100644 benchmarks/multi_node/amd_utils/trace_replay.sh

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 80c14f58b..ec5dc2b70 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2776,3 +2776,122 @@ dsv4-fp4-mi355x-sglang-agentic:
 # async scheduling, max-num-seqs=128, max-num-batched-tokens=8192,
 # gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
 # probe to validate the ROCm DP+EP path.
+
+# target
+dsv4-fp4-mi355x-sglang-agentic-hicache:
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+        #DPA, conc>=64
+      #- { tp: 8, dp-attn: false, offloading: none, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256]  }
+      #- { tp: 8, dp-attn: false, offloading: hicache, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256]  }
+      #- { tp: 8, dp-attn: false, offloading: none, conc-list: [64]  }
+      - { tp: 8, dp-attn: false, offloading: hicache, conc-list: [64]  }
+
+dsr1-fp4-mi355x-sglang-disagg-agentic-hicache:
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260609
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 1 ]
+        offloading: none
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+      - spec-decoding: "none"
+        conc-list: [ 1 ]
+        offloading: hicache
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+dsv4-fp4-mi355x-sglang-disagg-agentic-hicache:
+  image: lmsysorg/sglang-rocm:v0.5.13-rocm720-mi35x-20260614
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 1 ]
+        offloading: none
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+      - spec-decoding: "none"
+        conc-list: [ 1 ]
+        offloading: hicache
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml
index 3533e8175..6de3d92ea 100644
--- a/.github/workflows/run-sweep.yml
+++ b/.github/workflows/run-sweep.yml
@@ -416,6 +416,7 @@ jobs:
             decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}
             conc: ${{ matrix.config.conc }}
             duration: ${{ matrix.config.duration }}
+            offloading: ${{ matrix.config.offloading }}
             run-eval: false
             scenario-type: agentic-coding
 
diff --git a/benchmarks/multi_node/agentic/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/agentic/dsr1_fp4_mi355x_sglang-disagg.sh
new file mode 100755
index 000000000..c82aea48c
--- /dev/null
+++ b/benchmarks/multi_node/agentic/dsr1_fp4_mi355x_sglang-disagg.sh
@@ -0,0 +1,176 @@
+#!/usr/bin/env bash
+
+# Agentic trace-replay recipe for a disaggregated SGLang server on MI355X
+# (DeepSeek-R1-0528 MXFP4-v2, 1P1D TP8).
+#
+# CI-style sibling of dsr1_fp8_mi355x_sglang-disagg.sh: driven entirely by
+# environment variables and submits a SLURM job via submit.sh. The agentic /
+# HiCache-offload configuration is ported from local_test_dsr1_agentic_offload.sh
+# and is fully env-overridable so a YAML config can tune it.
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    SPEC_DECODING \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO \
+    DURATION \
+    OFFLOADING \
+    IS_AGENTIC \
+    FRAMEWORK
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+# Use upstreamed multi_node scripts (no external clone needed)
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+# Set up SGL launch script-specific environment variables
+export TIME_LIMIT="${TIME_LIMIT:-08:00:00}"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+# ── Identity / result naming ──
+export MODEL_PREFIX="${MODEL_PREFIX:-dsr1}"
+export PRECISION="${PRECISION:-fp4}"
+export RESULT_FILENAME="${RESULT_FILENAME:-${RUNNER_NAME:-dsr1-fp4-agentic}}"
+
+# ── Agentic benchmark params ──
+# DURATION threads through submit.sh -> job.slurm -> Docker -> bench.sh.
+# CONC_LIST drives the concurrency sweep (submit.sh splits on 'x').
+export DURATION="${DURATION:-1800}"
+export MAX_MODEL_LEN="${MAX_MODEL_LEN:-163840}"
+
+# ── Aiter fault mitigations (ROCm/ROCm#6023) ──
+export SGLANG_AITER_MLA_PERSIST="${SGLANG_AITER_MLA_PERSIST:-1}"
+# 1 => append --disable-custom-all-reduce to prefill+decode (Aiter fault mitigation).
+export DISABLE_CUSTOM_ALL_REDUCE="${DISABLE_CUSTOM_ALL_REDUCE:-1}"
+
+# # ── Hugging Face cache persistence ──
+# # Persist the HF Hub/datasets cache across runs so traces aren't re-downloaded.
+# export HF_CACHE_HOST_DIR="${HF_CACHE_HOST_DIR:-$HOME/.cache/huggingface}"
+# mkdir -p "${HF_CACHE_HOST_DIR}"
+# export EXTRA_DOCKER_MOUNTS="${EXTRA_DOCKER_MOUNTS:-} -v ${HF_CACHE_HOST_DIR}:/root/.cache/huggingface"
+# # HF auth token: provide via the environment/CI secrets (do NOT hardcode here).
+# export HF_TOKEN="${HF_TOKEN:-}"
+# if [[ -n "${HF_TOKEN:-}" && -z "${HUGGING_FACE_HUB_TOKEN:-}" ]]; then
+#   export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
+# fi
+
+# ── In-tree sglang patches ──
+# mori_conn.py targets hybrid-state bugs (GLM-5, Qwen3.5) not present in
+# DSR1-MXFP4-v2 (pure MLA). Skip the auto-apply in job.slurm.
+export MORI_CONN_PATCH="${MORI_CONN_PATCH:-skip}"
+
+# ── KV cache offloading (HiCache) ──
+# OFFLOADING=hicache (default for this recipe) | none. HICACHE_TIER:
+#   L2 -> GPU + CPU-DRAM host pool only.   L3 -> + Mooncake distributed KV store.
+export OFFLOADING="${OFFLOADING:-hicache}"
+export HICACHE_TIER="${HICACHE_TIER:-L3}"
+export HICACHE_TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-64}"
+export HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}"
+export HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-64}"
+# Per-rank L2 host pool in GB (100GB/rank x TP8 = ~800GB pinned host DRAM/node).
+export HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-100}"
+
+# ── HiCache layout/backend driven by HICACHE_TIER ──
+# Each tier has a canonical (layout, io_backend, write_policy, storage_backend)
+# combo (mirrors server_sglang.sh build_hicache_flags). Any var set explicitly
+# in the environment wins over the tier default.
+#   L3 (Mooncake): page_first + direct + write_through        + storage=mooncake
+#   L2 (CPU DRAM): layer_first + direct + write_through_selective + storage=none  (NOTE(review): previously listed "kernel"; the L2 branch below defaults to direct — confirm)
+if [[ "${HICACHE_TIER^^}" == "L3" ]]; then
+  export HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first}"
+  export HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+  export HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}"
+  export HICACHE_STORAGE_BACKEND="${HICACHE_STORAGE_BACKEND:-mooncake}"
+else
+  export HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
+  export HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+  # write_through_selective evicts only under GPU memory pressure, giving mori
+  # time to complete RDMA KV transfers before pages are freed. write_through
+  # evicts immediately and races with mori → GPU memory access faults.
+  export HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
+  export HICACHE_STORAGE_BACKEND="${HICACHE_STORAGE_BACKEND:-}"
+fi
+export HICACHE_DECODE="${HICACHE_DECODE:-0}"
+# Shared nodes: use non-default Mooncake ports to avoid colliding with other users.
+export MC_MASTER_PORT="${MC_MASTER_PORT:-58137}"
+export MC_METRICS_PORT="${MC_METRICS_PORT:-19003}"
+export MC_PATCH_HOSTPOOL="${MC_PATCH_HOSTPOOL:-1}"
+export MC_PROTOCOL="${MC_PROTOCOL:-tcp}"
+export MC_GLOBAL_SEG="${MC_GLOBAL_SEG:-30gb}"
+export MC_DEVICE="${MC_DEVICE:-rdma0}"
+export MC_MASTER_ADDR="${MC_MASTER_ADDR:-}"
+
+# ── MoRIIO RDMA Send Queue tuning (headroom for conc>=8) ──
+export MORI_IO_SQ_BACKOFF_TIMEOUT_US="${MORI_IO_SQ_BACKOFF_TIMEOUT_US:-500000}"
+export MORI_IO_QP_MAX_SEND_WR="${MORI_IO_QP_MAX_SEND_WR:-32768}"
+
+# ── SGLang PD router policy + server metrics ──
+export PREFILL_ROUTER_POLICY="${PREFILL_ROUTER_POLICY:-random}"
+export ENABLE_METRICS="${ENABLE_METRICS:-1}"
+
+# ── MTP ──
+export DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}"
+
+# Derive EP/DP enable flags from the topology inputs (same as the fixed-seq recipe).
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+export PREFILL_ENABLE_EP=false
+else
+export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
+export PREFILL_ENABLE_DP=true
+else
+export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+export DECODE_ENABLE_EP=false
+else
+export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then
+export DECODE_ENABLE_DP=true
+else
+export DECODE_ENABLE_DP=false
+fi
+
+# Launch the job. CONC_LIST is space-delimited in YAML; submit.sh wants 'x'.
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
+    $PREFILL_NUM_WORKERS \
+    $DECODE_NODES \
+    $DECODE_NUM_WORKERS \
+    $ISL $OSL "${CONC_LIST// /x}" inf \
+    ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
+    ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP} ${DECODE_TP} \
+    ${RANDOM_RANGE_RATIO})
+
+if [[ $? -ne 0 ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/benchmarks/multi_node/agentic/dsv4_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/agentic/dsv4_fp4_mi355x_sglang-disagg.sh
new file mode 100755
index 000000000..45c2c96b7
--- /dev/null
+++ b/benchmarks/multi_node/agentic/dsv4_fp4_mi355x_sglang-disagg.sh
@@ -0,0 +1,160 @@
+#!/usr/bin/env bash
+
+# Agentic trace-replay recipe for a disaggregated SGLang server on MI355X
+# (DeepSeek-V4-Pro FP4, 1P1D TP8).
+#
+# CI-style sibling of dsr1_fp4_mi355x_sglang-disagg.sh: driven entirely by
+# environment variables and submits a SLURM job via submit.sh. The agentic /
+# HiCache-offload configuration mirrors the DSR1 recipe but uses DSV4-Pro
+# specific flags (dsv4 attention backend, page-size 256, SWA settings).
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    SPEC_DECODING \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO \
+    DURATION \
+    OFFLOADING \
+    IS_AGENTIC \
+    FRAMEWORK
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+# Use upstreamed multi_node scripts (no external clone needed)
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+# Set up SGL launch script-specific environment variables
+export TIME_LIMIT="${TIME_LIMIT:-08:00:00}"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+# ── Identity / result naming ──
+export MODEL_PREFIX="${MODEL_PREFIX:-dsv4}"
+export PRECISION="${PRECISION:-fp4}"
+export RESULT_FILENAME="${RESULT_FILENAME:-${RUNNER_NAME:-dsv4-fp4-agentic}}"
+
+# ── Agentic benchmark params ──
+export DURATION="${DURATION:-1800}"
+# DSV4-Pro max model len for agentic traces (matches single-node recipe).
+export MAX_MODEL_LEN="${MAX_MODEL_LEN:-262144}"
+
+# ── In-tree sglang patches ──
+# mori_conn.py targets hybrid-state bugs (GLM-5, Qwen3.5). DSV4-Pro uses a
+# pure MoE/DSA architecture without hybrid state; skip to avoid interference.
+export MORI_CONN_PATCH="${MORI_CONN_PATCH:-skip}"
+
+# ── Aiter fault mitigation ──
+# --disable-custom-all-reduce avoids a known aiter fault on MI355X.
+export DISABLE_CUSTOM_ALL_REDUCE="${DISABLE_CUSTOM_ALL_REDUCE:-0}"
+
+# ── KV cache offloading (HiCache) ──
+# OFFLOADING=hicache | none (passed from YAML; default none for disagg).
+# HICACHE_TIER: L2 -> GPU + CPU-DRAM host pool. L3 -> + Mooncake store.
+export OFFLOADING="${OFFLOADING:-none}"
+export HICACHE_TIER="${HICACHE_TIER:-L3}"
+export HICACHE_TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-64}"
+export HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}"
+# DSV4 uses page-size 256 (set in models.yaml); HiCache must match.
+export HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-256}"
+# Per-rank L2 host pool in GB.
+export HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-100}"
+
+# ── HiCache layout/backend by tier ──
+#   L3 (Mooncake): page_first + direct + write_through     + storage=mooncake
+#   L2 (CPU DRAM): layer_first + direct + write_through_selective + storage=none
+# NOTE: write_through_selective evicts only under GPU memory pressure, avoiding
+# the mori RDMA race that causes GPU memory access faults with write_through.
+if [[ "${HICACHE_TIER^^}" == "L3" ]]; then
+  export HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first}"
+  export HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+  export HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}"
+  export HICACHE_STORAGE_BACKEND="${HICACHE_STORAGE_BACKEND:-mooncake}"
+else
+  export HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
+  export HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+  export HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
+  export HICACHE_STORAGE_BACKEND="${HICACHE_STORAGE_BACKEND:-}"
+fi
+export HICACHE_DECODE="${HICACHE_DECODE:-0}"
+# Shared nodes: use non-default Mooncake ports to avoid collisions.
+export MC_MASTER_PORT="${MC_MASTER_PORT:-58137}"
+export MC_METRICS_PORT="${MC_METRICS_PORT:-19003}"
+export MC_PATCH_HOSTPOOL="${MC_PATCH_HOSTPOOL:-1}"
+export MC_PROTOCOL="${MC_PROTOCOL:-tcp}"
+export MC_GLOBAL_SEG="${MC_GLOBAL_SEG:-30gb}"
+export MC_DEVICE="${MC_DEVICE:-rdma0}"
+export MC_MASTER_ADDR="${MC_MASTER_ADDR:-}"
+
+# ── MoRIIO RDMA Send Queue tuning ──
+export MORI_IO_SQ_BACKOFF_TIMEOUT_US="${MORI_IO_SQ_BACKOFF_TIMEOUT_US:-500000}"
+export MORI_IO_QP_MAX_SEND_WR="${MORI_IO_QP_MAX_SEND_WR:-32768}"
+
+# ── SGLang PD router policy + server metrics ──
+export PREFILL_ROUTER_POLICY="${PREFILL_ROUTER_POLICY:-random}"
+export ENABLE_METRICS="${ENABLE_METRICS:-1}"
+
+# ── MTP ──
+export DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}"
+
+# Derive EP/DP enable flags from the topology inputs.
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+export PREFILL_ENABLE_EP=false
+else
+export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
+export PREFILL_ENABLE_DP=true
+else
+export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+export DECODE_ENABLE_EP=false
+else
+export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then
+export DECODE_ENABLE_DP=true
+else
+export DECODE_ENABLE_DP=false
+fi
+
+# Launch the job. CONC_LIST is space-delimited in YAML; submit.sh wants 'x'.
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
+    $PREFILL_NUM_WORKERS \
+    $DECODE_NODES \
+    $DECODE_NUM_WORKERS \
+    $ISL $OSL "${CONC_LIST// /x}" inf \
+    ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
+    ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP} ${DECODE_TP} \
+    ${RANDOM_RANGE_RATIO})
+
+if [[ $? -ne 0 ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 6b0e4206a..8854949be 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -12,6 +12,17 @@ set -x
 ENGINE="${ENGINE:-sglang-disagg}"
 export PYTHONDONTWRITEBYTECODE=1
 
+# HiCache / Mooncake settings are delivered via a bind-mounted config file (see
+# job.slurm) instead of individual docker -e flags. Source it with auto-export so
+# the values land in the environment before the "${VAR:-default}" fallbacks below
+# apply. Guarded so non-container / single-node runs without the mount still work.
+if [[ -f /config/hicache_mc.env ]]; then
+    set -a
+    source /config/hicache_mc.env
+    set +a
+    echo "[INFO] Loaded HiCache/Mooncake config from /config/hicache_mc.env"
+fi
+
 # =============================================================================
 # Shared: IBDEVICES detection
 # =============================================================================
@@ -125,15 +136,22 @@ else
 
     export SGLANG_USE_AITER=1
     export AITER_LOG_LEVEL=ERROR
+    # Align with mori-scheduler/scripts/multi_node reference: persist the AITER MLA
+    # workspace (MLA prefill path) and enable the MXFP4 MoE scale-factor for this
+    # MXFP4 model. Overridable.
+    export SGLANG_AITER_MLA_PERSIST="${SGLANG_AITER_MLA_PERSIST:-1}"
+    export AITER_MXFP4_MOE_SF="${AITER_MXFP4_MOE_SF:-1}"
 
     export SGLANG_MORI_DISPATCH_DTYPE=auto
     export MORI_COMBINE_DTYPE_PREFILL=fp8_direct_cast
     export MORI_COMBINE_DTYPE_DECODE=fp8
     export SGLANG_MORI_QP_PER_TRANSFER=4
     export SGLANG_MORI_NUM_WORKERS=4
-    export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000
+    # Keep these as overridable defaults (not hard assignments), otherwise
+    # later tuning blocks cannot raise them for high-concurrency runs.
+    export MORI_IO_SQ_BACKOFF_TIMEOUT_US="${MORI_IO_SQ_BACKOFF_TIMEOUT_US:-500000}"
 
-    export MORI_IO_QP_MAX_SEND_WR=16384
+    export MORI_IO_QP_MAX_SEND_WR="${MORI_IO_QP_MAX_SEND_WR:-16384}"
     export MORI_IO_QP_MAX_CQE=32768
     export MORI_IO_QP_MAX_SGE=4
 
@@ -151,6 +169,8 @@ else
 
     # Disable allocating memory in one pass
     export MORI_SHMEM_MODE=ISOLATION
+    # mori shmem heap size (matches mori-scheduler reference). Overridable.
+    export MORI_SHMEM_HEAP_SIZE="${MORI_SHMEM_HEAP_SIZE:-1G}"
 
     # Enable spec v2
     export SGLANG_ENABLE_SPEC_V2=1
@@ -177,6 +197,14 @@ else
     # 1 mirrors router logs to stdout via tee (useful for live debugging).
     export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}"
 
+    # MoRIIO SQ tuning defaults (can be overridden by caller env).
+    # Keep explicit exports here so tuned values are guaranteed to reach the
+    # sglang.launch_server process even if upstream env threading regresses.
+    export MORI_IO_SQ_BACKOFF_TIMEOUT_US="${MORI_IO_SQ_BACKOFF_TIMEOUT_US:-500000}"
+    export MORI_IO_QP_MAX_SEND_WR="${MORI_IO_QP_MAX_SEND_WR:-}"
+
+    export MC_TE_METRIC=1
+
     # QoS/DSCP configuration
     # Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname
     if [[ -n "$MORI_RDMA_TC" ]]; then
diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 5e8e67606..2dc6227ee 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -201,11 +201,41 @@ else
         fi
     }
 
-    if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then
+    # Extract hf_dir from models.yaml (same as vllm-disagg path above)
+    SGL_DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next}
+        found && /^[^ ]/{exit}
+        found && /hf_dir:/{gsub(/[" ]/, "", $2); print $2; exit}' "$MODELS_YAML")
+    SGL_DISK_DIR_NAME="${SGL_DISK_DIR_NAME:-$MODEL_NAME}"
+
+    # Prefer the caller-supplied MODEL_PATH (recipe scripts set this explicitly);
+    # fall back to MODEL_DIR/hf_dir then MODEL_DIR/MODEL_NAME.
+    if [[ -n "${MODEL_PATH:-}" && "$MODEL_PATH" != "$MODEL_DIR" ]]; then
+        # Caller already resolved the path (e.g. MODEL_PATH=/it-share/hf_cache/models--...)
+        # Use it directly if it exists on all nodes, otherwise try subdirectory combos.
+        if check_model_path "$MODEL_PATH" "MODEL_PATH (caller-supplied)"; then
+            echo "Selected MODEL_PATH: $MODEL_PATH (caller-supplied, available on all nodes)"
+        elif check_model_path "$MODEL_PATH/$SGL_DISK_DIR_NAME" "$MODEL_PATH/$SGL_DISK_DIR_NAME"; then
+            MODEL_PATH="$MODEL_PATH/$SGL_DISK_DIR_NAME"
+            echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)"
+        elif check_model_path "$MODEL_PATH/$MODEL_NAME" "$MODEL_PATH/$MODEL_NAME"; then
+            MODEL_PATH="$MODEL_PATH/$MODEL_NAME"
+            echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)"
+        else
+            echo "FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in:"
+            echo "  - $MODEL_PATH"
+            echo "  - $MODEL_PATH/$SGL_DISK_DIR_NAME"
+            echo "  - $MODEL_PATH/$MODEL_NAME"
+            exit 1
+        fi
+    elif check_model_path "$MODEL_DIR/$SGL_DISK_DIR_NAME" "$MODEL_DIR/$SGL_DISK_DIR_NAME"; then
+        MODEL_PATH="$MODEL_DIR/$SGL_DISK_DIR_NAME"
+        echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)"
+    elif check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then
         MODEL_PATH="$MODEL_DIR/$MODEL_NAME"
         echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)"
     else
         echo "FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in:"
+        echo "  - $MODEL_DIR/$SGL_DISK_DIR_NAME"
         echo "  - $MODEL_DIR/$MODEL_NAME"
         exit 1
     fi
@@ -362,7 +392,6 @@ DOCKER_ENV_COMMON=(
     -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER
     -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY
     -e TQDM_MININTERVAL=\$TQDM_MININTERVAL
-    -e DRY_RUN=\$DRY_RUN
     -e BENCHMARK_LOGS_DIR=/benchmark_logs
     -e ENGINE=\$ENGINE
     -e WS_PATH=${WS_PATH}
@@ -378,11 +407,25 @@ DOCKER_ENV_COMMON=(
     -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE
     -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP
     -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP
+    -e PREFILL_CONTEXT_LENGTH=\${PREFILL_CONTEXT_LENGTH:-}
+    -e PREFILL_CHUNKED_PREFILL_SIZE=\${PREFILL_CHUNKED_PREFILL_SIZE:-}
+    -e SGLANG_AITER_MLA_PERSIST=\${SGLANG_AITER_MLA_PERSIST:-}
+    -e DISABLE_CUSTOM_ALL_REDUCE=\${DISABLE_CUSTOM_ALL_REDUCE:-0}
+    -e MAX_MODEL_LEN=\${MAX_MODEL_LEN:-}
+    -e DURATION=\${DURATION:-1800}
+    -e IS_AGENTIC=\${IS_AGENTIC:-0}
+    -e OFFLOADING=\${OFFLOADING:-none}
+    -e ENABLE_METRICS=\${ENABLE_METRICS:-0}
+    -e PREFILL_ROUTER_POLICY=\${PREFILL_ROUTER_POLICY:-random}
+    -e DECODE_ROUTER_POLICY=\${DECODE_ROUTER_POLICY:-random}
+    -e MORI_IO_SQ_BACKOFF_TIMEOUT_US=\${MORI_IO_SQ_BACKOFF_TIMEOUT_US:-}
+    -e MORI_IO_QP_MAX_SEND_WR=\${MORI_IO_QP_MAX_SEND_WR:-}
     -e DECODE_TP_SIZE=\$DECODE_TP_SIZE
     -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP
     -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP
     -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE
     -e IS_MULTINODE=\$IS_MULTINODE
+    -e DRY_RUN=\${DRY_RUN:-0}
 )
 
 # Engine-specific env vars
@@ -408,6 +451,31 @@ else
     )
 fi
 
+# HiCache / Mooncake settings are delivered via a bind-mounted config file rather
+# than a long list of docker -e flags. Write it once to the shared benchmark-logs
+# dir (already a host path, visible on every node) and mount it read-only at
+# /config/hicache_mc.env, where env.sh sources it before applying its defaults.
+# Empty values are preserved so env.sh's "${VAR:-default}" fallbacks still apply.
+HICACHE_MC_CONFIG="${BENCHMARK_LOGS_DIR}/hicache_mc_${SLURM_JOB_ID}.env"
+cat > "$HICACHE_MC_CONFIG" <<EOF
+HICACHE_SIZE_GB=${HICACHE_SIZE_GB:-}
+HICACHE_TOTAL_CPU_DRAM_GB=${HICACHE_TOTAL_CPU_DRAM_GB:-}
+HICACHE_HOST_POOL_COUNT=${HICACHE_HOST_POOL_COUNT:-}
+HICACHE_PAGE_SIZE=${HICACHE_PAGE_SIZE:-}
+HICACHE_IO_BACKEND=${HICACHE_IO_BACKEND:-}
+HICACHE_MEM_LAYOUT=${HICACHE_MEM_LAYOUT:-}
+HICACHE_WRITE_POLICY=${HICACHE_WRITE_POLICY:-}
+HICACHE_STORAGE_BACKEND=${HICACHE_STORAGE_BACKEND:-}
+HICACHE_DECODE=${HICACHE_DECODE:-}
+MC_MASTER_PORT=${MC_MASTER_PORT:-}
+MC_METRICS_PORT=${MC_METRICS_PORT:-}
+MC_PROTOCOL=${MC_PROTOCOL:-}
+MC_GLOBAL_SEG=${MC_GLOBAL_SEG:-}
+MC_DEVICE=${MC_DEVICE:-}
+MC_MASTER_ADDR=${MC_MASTER_ADDR:-}
+EOF
+echo "[config] wrote HiCache/Mooncake settings -> $HICACHE_MC_CONFIG"
+
 # Engine-specific container filter for pre-clean
 CONT_FILTER="name=^container_${ENGINE}_"
 
@@ -489,6 +557,7 @@ fi
     -v /tmp:/run_logs \
     -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \
     -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \
+    -v ${HICACHE_MC_CONFIG}:/config/hicache_mc.env:ro \
     ${EXTRA_DOCKER_MOUNTS:-} \
     ${DOCKER_ENV_COMMON[*]} \
     ${DOCKER_ENV_ENGINE[*]} \
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index e68c448ce..60ce93752 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -359,6 +359,7 @@ DeepSeek-R1-0528-MXFP4-v2:
 # prefill.disable_cuda_graph routes prefill to --disable-cuda-graph; decode keeps
 # --cuda-graph-bs. See dsv4_mi355x_sglang_disagg_plan.md.
 DeepSeek-V4-Pro:
+  hf_dir: "models--deepseek-ai--DeepSeek-V4-Pro"
   base_flags: "--decode-log-interval 100 --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend dsv4 --page-size 256 --swa-full-tokens-ratio 0.1 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --disaggregation-transfer-backend mori"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh
index 38fbdfc8e..67b10a2fd 100755
--- a/benchmarks/multi_node/amd_utils/server_sglang.sh
+++ b/benchmarks/multi_node/amd_utils/server_sglang.sh
@@ -419,6 +419,12 @@ if [ "$NODE_RANK" -eq 0 ]; then
 
     echo "================================================"
 
+    # Install numactl on demand when LAUNCH_PREFIX uses it and the container lacks it; refresh the apt index first because the image may ship without package lists.
+    if [[ "${LAUNCH_PREFIX:-}" == *numactl* ]] && ! command -v numactl &>/dev/null; then
+        echo "[server_sglang] Installing numactl (required by LAUNCH_PREFIX)..."
+        { apt-get update -q && apt-get install -y -q numactl; } >/dev/null 2>&1 || { echo "[server_sglang] WARNING: numactl install failed"; }
+    fi
+
     # start the head prefill server
     PREFILL_MORI_MOE_ENV=""
     set -x
@@ -426,7 +432,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
         PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
     fi
     set +x
-    PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL:-${MORI_MAX_DISPATCH_TOKENS_PREFILL}} python3 -m sglang.launch_server \
+    PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL:-${MORI_MAX_DISPATCH_TOKENS_PREFILL}} MORI_IO_SQ_BACKOFF_TIMEOUT_US=${MORI_IO_SQ_BACKOFF_TIMEOUT_US} MORI_IO_QP_MAX_SEND_WR=${MORI_IO_QP_MAX_SEND_WR} ${LAUNCH_PREFIX:-} python3 -m sglang.launch_server \
         --model-path $MODEL_DIR/$MODEL_NAME \
         --disaggregation-mode prefill \
         --disaggregation-ib-device ${IBDEVICES} \
@@ -520,11 +526,24 @@ if [ "$NODE_RANK" -eq 0 ]; then
         export IS_MTP=false
     fi
 
-    # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier
-    BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \
-        $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \
-        ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \
-        ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}"
+    # Select the benchmark runner.
+    # IS_AGENTIC=1/true  → agentic trace replay (trace_replay.sh)
+    # IS_AGENTIC unset/0 → fixed-seq-len throughput benchmark (bench.sh)
+    if [[ "${IS_AGENTIC:-0}" == "1" || "${IS_AGENTIC:-}" == "true" ]]; then
+        # trace_replay.sh signature: model_path model_name concurrency_list log_path
+        BENCH_CMD="bash $SGLANG_WS_PATH/trace_replay.sh \
+            $MODEL_DIR $MODEL_NAME $BENCH_MAX_CONCURRENCY /run_logs/slurm_job-${SLURM_JOB_ID}"
+        echo "Benchmark runner: trace_replay.sh (agentic, OFFLOADING=${OFFLOADING:-none}, CONC=${BENCH_MAX_CONCURRENCY})"
+    else
+        # bench.sh signature:
+        # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path
+        # isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier
+        BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \
+            $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \
+            ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \
+            ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}"
+        echo "Benchmark runner: bench.sh (fixed-seq-len)"
+    fi
 
     if [[ "${EVAL_ONLY:-false}" == "true" ]]; then
         echo "EVAL_ONLY mode: skipping throughput benchmark"
@@ -658,7 +677,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
         PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
     fi
     set +x
-    PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL:-${MORI_MAX_DISPATCH_TOKENS_PREFILL}} python3 -m sglang.launch_server \
+    PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL:-${MORI_MAX_DISPATCH_TOKENS_PREFILL}} MORI_IO_SQ_BACKOFF_TIMEOUT_US=${MORI_IO_SQ_BACKOFF_TIMEOUT_US} MORI_IO_QP_MAX_SEND_WR=${MORI_IO_QP_MAX_SEND_WR} ${LAUNCH_PREFIX:-} python3 -m sglang.launch_server \
         --model-path $MODEL_DIR/${MODEL_NAME} \
         --disaggregation-mode prefill \
         --disaggregation-ib-device ${IBDEVICES} \
@@ -720,13 +739,19 @@ else
     echo "Decode node rank: $RANK"
     echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}"
 
+    # Install numactl on demand when LAUNCH_PREFIX uses it and the container lacks it; refresh the apt index first because the image may ship without package lists.
+    if [[ "${LAUNCH_PREFIX:-}" == *numactl* ]] && ! command -v numactl &>/dev/null; then
+        echo "[server_sglang] Installing numactl (required by LAUNCH_PREFIX)..."
+        { apt-get update -q && apt-get install -y -q numactl; } >/dev/null 2>&1 || { echo "[server_sglang] WARNING: numactl install failed"; }
+    fi
+
     DECODE_MORI_MOE_ENV=""
     set -x
     if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_DECODE" ]]; then
         DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}"
     fi
     set +x
-    DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_DECODE:-${MORI_MAX_DISPATCH_TOKENS_DECODE}} python3 -m sglang.launch_server \
+    DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_DECODE:-${MORI_MAX_DISPATCH_TOKENS_DECODE}} MORI_IO_SQ_BACKOFF_TIMEOUT_US=${MORI_IO_SQ_BACKOFF_TIMEOUT_US} MORI_IO_QP_MAX_SEND_WR=${MORI_IO_QP_MAX_SEND_WR} ${LAUNCH_PREFIX:-} python3 -m sglang.launch_server \
         --model-path ${MODEL_DIR}/${MODEL_NAME} \
         --disaggregation-mode decode \
         --disaggregation-ib-device ${IBDEVICES} \
diff --git a/benchmarks/multi_node/amd_utils/trace_replay.sh b/benchmarks/multi_node/amd_utils/trace_replay.sh
new file mode 100644
index 000000000..d3e6b2547
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/trace_replay.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+# Agentic Trace-Replay Benchmark Runner (disaggregated serving)
+#
+# ENGINE=sglang-disagg (default): MODEL is set to MODEL_PATH
+# ENGINE=vllm-disagg:             MODEL is set to MODEL_NAME (falls back to MODEL_PATH)
+#
+# Replays agentic traces once per concurrency level and writes each level's
+# results under <log_path>/agentic/conc<N> (parent dir overridable via RESULT_DIR).
+#
+# Usage: bash trace_replay.sh <model_dir> <model_name> \
+#            [concurrency_list] [log_path]
+#   concurrency_list: x-delimited, e.g. "8x16x32" (default "1"); log_path default: /run_logs
+
+ENGINE="${ENGINE:-sglang-disagg}"
+
+model_path=$1
+model_name=$2
+concurrency_list=${3:-"1"}
+MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}"
+# vllm-disagg uses --served-model-name MODEL_NAME; sglang defaults to MODEL_PATH
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    MODEL="${MODEL_NAME:-${MODEL_PATH}}"
+else
+    MODEL="${MODEL_PATH}"
+fi
+log_path=${4:-/run_logs}
+
+# Split BENCH_MAX_CONCURRENCY (x-delimited, e.g. "8x16x32") into an array.
+# Falls back to 1 if unset so the loop always runs at least once.
+IFS='x' read -r -a chosen_concurrencies <<< "${concurrency_list}"
+
+
+ROUTER_PORT="${ROUTER_PORT:-30000}"
+
+export TRANSFORMERS_VERBOSITY=error
+export TOKENIZERS_PARALLELISM=false
+
+# echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"
+
+RESULT_DIR="${RESULT_DIR:-${log_path}/agentic}"
+mkdir -p "$RESULT_DIR"
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+# REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
+
+PORT="${ROUTER_PORT}"
+MODEL="${MODEL:-${BENCH_MODEL}}"
+DURATION="${DURATION:-1800}"
+export MODEL DURATION MAX_MODEL_LEN
+RESULT_DIR="${RESULT_DIR:-${profile_folder}}"
+# RESULT_FILENAME_BASE="${RESULT_FILENAME:-agentic_bench}"
+
+mkdir -p "$RESULT_DIR"
+
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k
+resolve_trace_source
+install_agentic_deps
+
+ANY_FAILED=0
+for max_concurrency in "${chosen_concurrencies[@]}"; do
+
+    echo "=========================================="
+    echo "Agentic trace replay: conc=$max_concurrency"
+    echo "=========================================="
+
+    # Write artifacts into a per-concurrency subdirectory (conc<N>) of RESULT_DIR
+    # rather than flat as in the single-node agentic recipes and agentic_srt.sh. The
+    # CI matrix explodes agentic runs to one concurrency per job, so the loop runs once.
+    CONC_RESULT_DIR="$RESULT_DIR/conc${max_concurrency}"
+    mkdir -p "$CONC_RESULT_DIR"
+
+    CONC="$max_concurrency"
+    USERS="$max_concurrency"
+    export CONC USERS
+    build_replay_cmd "$CONC_RESULT_DIR"
+
+    # Per-conc result name consumed by write_agentic_result_json/process_agentic_result.py.
+    # export RESULT_FILENAME="${RESULT_FILENAME_BASE}_conc${max_concurrency}"
+    if ! run_agentic_replay_and_write_outputs "$CONC_RESULT_DIR"; then
+        echo "WARNING: agentic trace replay for conc=$max_concurrency failed (replay or validation) after writing available results" >&2
+        ANY_FAILED=1
+    fi
+    
+    echo "-----------------------------------------"
+
+done
+
+# export RESULT_FILENAME="$RESULT_FILENAME_BASE"
+
+if [ "$ANY_FAILED" -ne 0 ]; then
+    echo "WARNING: at least one conc had a non-zero exit; per-conc result files were still written when possible." >&2
+fi
diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py
index 53efcca9f..ff679a828 100644
--- a/utils/matrix_logic/generate_sweep_configs.py
+++ b/utils/matrix_logic/generate_sweep_configs.py
@@ -437,6 +437,7 @@ def generate_full_sweep(args, all_config_data, runner_data):
                                 Fields.PREFILL.value: prefill,
                                 Fields.DECODE.value: decode,
                                 Fields.CONC.value: conc,
+                                Fields.OFFLOADING.value: offloading,
                                 Fields.DURATION.value: duration,
                                 Fields.EXP_NAME.value: (
                                     f"{model_code}_p{prefill[Fields.NUM_WORKER.value]}x{prefill[Fields.TP.value]}"
@@ -845,6 +846,7 @@ def generate_test_config_sweep(args, all_config_data, runner_data=None):
                                 Fields.PREFILL.value: prefill,
                                 Fields.DECODE.value: decode,
                                 Fields.CONC.value: conc,
+                                Fields.OFFLOADING.value: offloading,
                                 Fields.DURATION.value: duration,
                                 Fields.EXP_NAME.value: (
                                     f"{model_code}_p{prefill[Fields.NUM_WORKER.value]}x{prefill[Fields.TP.value]}"
diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py
index 4e3f0bbd7..0bc8efedc 100644
--- a/utils/matrix_logic/validation.py
+++ b/utils/matrix_logic/validation.py
@@ -180,6 +180,9 @@ class MultiNodeAgenticMatrixEntry(BaseModel):
     prefill: WorkerConfig
     decode: WorkerConfig
     conc: int
+    offloading: Literal["none", "cpu", "ssd", "lmcache", "lmcache-mp", "hicache"] = Field(
+        default="none", alias=Fields.OFFLOADING.value
+    )
     duration: int = Field(default=1800, alias=Fields.DURATION.value)
     exp_name: str = Field(alias=Fields.EXP_NAME.value)
     disagg: bool

From 3b74ce537b103d422624d9d1a513e830c389692d Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 12 Jun 2026 05:35:20 +0000
Subject: [PATCH 15/21] fix image

---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index ec5dc2b70..e25384931 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2141,13 +2141,13 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
 # topology families captured from the validated manual recipe (see
 # dsv4_mi355x_sglang_disagg_plan.md):
 #   - pure-TP 1P1D (TP8, mori KV transfer)
-#   - DEP 1P1D     (TP8/EP8/DP8, mori KV transfer + mori MoE a2a, dp-attention)
+#   - DEP 2P1D     (TP8/EP8/DP8, mori KV transfer + mori MoE a2a, dp-attention)
 # DSv4-specific serving knobs (attention-backend dsv4, page-size 256, unified_kv_triton,
 # AITER indexer, deepseekv4 parsers) live in amd_utils/{models.yaml,env.sh}; the bench
 # client uses --dsv4 framing (amd_utils/bench.sh). STP only for now (reference recipe has
 # no spec decoding); MTP is a follow-up.
 dsv4-fp4-mi355x-sglang-disagg:
-  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610
+  image: rocm/sgl-dev:sglang-0.5.12.post1-rocm720-mi35x-mori-0610-dsv4
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x-disagg

From 3f6f20f565dba1c162379390d4e6239e84cd9a9f Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 12 Jun 2026 06:08:12 +0000
Subject: [PATCH 16/21] fix the image

---
 .github/configs/amd-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index e25384931..8c4dcf0b8 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2147,7 +2147,7 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
 # client uses --dsv4 framing (amd_utils/bench.sh). STP only for now (reference recipe has
 # no spec decoding); MTP is a follow-up.
 dsv4-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.12.post1-rocm720-mi35x-mori-0610-dsv4
+  image: rocm/sgl-dev:sglang-0.5.12.post1-rocm720-mi35x-mori-0610-dsv4-ep
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x-disagg

From 31b392a752889fdee9c0da12b09af373171a4d81 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 12 Jun 2026 06:41:59 +0000
Subject: [PATCH 17/21] fix

---
 benchmarks/multi_node/amd_utils/env.sh           | 4 ++--
 benchmarks/multi_node/amd_utils/server_sglang.sh | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 8854949be..3ca0308b1 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -273,8 +273,8 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
         # Per-role MoRI dispatch sizing (used by the harness chunked/MoE math)
         export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192
         export MORI_MAX_DISPATCH_TOKENS_DECODE=64
-        export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=2048
-        export MORI_MOE_MAX_INPUT_TOKENS_DECODE=332
+        # export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=2048
+        # export MORI_MOE_MAX_INPUT_TOKENS_DECODE=332
 
         # PER_RANK dispatch tokens are pinned independently of the sizing above
         # (16384 prefill / 128 decode in the reference recipe). server_sglang.sh
diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh
index 67b10a2fd..46e12a64a 100755
--- a/benchmarks/multi_node/amd_utils/server_sglang.sh
+++ b/benchmarks/multi_node/amd_utils/server_sglang.sh
@@ -196,7 +196,7 @@ if [[ "$PREFILL_ENABLE_DP" == "true" ]] && [[ "$PREFILL_ENABLE_EP" == "true" ]];
     prefill_max_running_requests=$BENCH_MAX_CONC_VALUE
     prefill_dp_ranks=$PREFILL_TP_SIZE
     # MORI_MAX_DISPATCH_TOKENS_PREFILL stays at 8192 (no change)
-    MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2))
+    # MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2))
     echo "[DP+EP override] Prefill: max-running-requests=$prefill_max_running_requests, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_PREFILL"
 fi
 
@@ -217,7 +217,7 @@ if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; t
     decode_max_running_requests=$BENCH_MAX_CONC_VALUE
     decode_dp_ranks=$DECODE_TP_SIZE
     MORI_MAX_DISPATCH_TOKENS_DECODE=$((BENCH_MAX_CONC_VALUE / decode_dp_ranks))
-    MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10))
+    # MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10))
     # Update derived variable
     SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
     export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD
@@ -253,7 +253,7 @@ fi
 
 if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then
     MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
-    MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
+    # MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
 fi
 
 # =============================================================================

From dbbfc64521847376eb35812718058e061385c2c9 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 12 Jun 2026 07:53:01 +0000
Subject: [PATCH 18/21] fix: unset MORI_MOE_MAX_INPUT_TOKENS_* and add --ep-dispatch-algorithm fake for DeepSeek-V4-Pro

---
 benchmarks/multi_node/amd_utils/env.sh      | 4 ++--
 benchmarks/multi_node/amd_utils/models.yaml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 3ca0308b1..ed48813e2 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -273,8 +273,8 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
         # Per-role MoRI dispatch sizing (used by the harness chunked/MoE math)
         export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192
         export MORI_MAX_DISPATCH_TOKENS_DECODE=64
-        # export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=2048
-        # export MORI_MOE_MAX_INPUT_TOKENS_DECODE=332
+        unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL
+        unset MORI_MOE_MAX_INPUT_TOKENS_DECODE
 
         # PER_RANK dispatch tokens are pinned independently of the sizing above
         # (16384 prefill / 128 decode in the reference recipe). server_sglang.sh
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 60ce93752..28e785222 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -360,7 +360,7 @@ DeepSeek-R1-0528-MXFP4-v2:
 # --cuda-graph-bs. See dsv4_mi355x_sglang_disagg_plan.md.
 DeepSeek-V4-Pro:
   hf_dir: "models--deepseek-ai--DeepSeek-V4-Pro"
-  base_flags: "--decode-log-interval 100 --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend dsv4 --page-size 256 --swa-full-tokens-ratio 0.1 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 100 --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend dsv4 --page-size 256 --swa-full-tokens-ratio 0.1 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --disaggregation-transfer-backend mori"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8

From 836f3968cb8cfd53dc1dcf3c445a25edf27a7616 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 15 Jun 2026 01:02:57 +0000
Subject: [PATCH 19/21] bump image

---
 .github/configs/amd-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 8c4dcf0b8..ef6b96150 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2147,7 +2147,7 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
 # client uses --dsv4 framing (amd_utils/bench.sh). STP only for now (reference recipe has
 # no spec decoding); MTP is a follow-up.
 dsv4-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.12.post1-rocm720-mi35x-mori-0610-dsv4-ep
+  image: lmsysorg/sglang-rocm:v0.5.13-rocm720-mi35x-20260614
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x-disagg

From c21ad067aac39c0206e2eccfe7807d448b429d1b Mon Sep 17 00:00:00 2001
From: Theresa Shan <theresa.shan@amd.com>
Date: Tue, 16 Jun 2026 06:56:23 +0000
Subject: [PATCH 20/21] remove numactl

Signed-off-by: Theresa Shan <theresa.shan@amd.com>
---
 benchmarks/multi_node/amd_utils/server_sglang.sh | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh
index 46e12a64a..9fba74661 100755
--- a/benchmarks/multi_node/amd_utils/server_sglang.sh
+++ b/benchmarks/multi_node/amd_utils/server_sglang.sh
@@ -419,12 +419,6 @@ if [ "$NODE_RANK" -eq 0 ]; then
 
     echo "================================================"
 
-    # Install LAUNCH_PREFIX dependencies if needed (e.g. numactl missing in container)
-    if [[ "${LAUNCH_PREFIX:-}" == *numactl* ]] && ! command -v numactl &>/dev/null; then
-        echo "[server_sglang] Installing numactl (required by LAUNCH_PREFIX)..."
-        apt-get install -y -q numactl >/dev/null 2>&1 || { echo "[server_sglang] WARNING: numactl install failed"; }
-    fi
-
     # start the head prefill server
     PREFILL_MORI_MOE_ENV=""
     set -x
@@ -739,11 +733,6 @@ else
     echo "Decode node rank: $RANK"
     echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}"
 
-    # Install LAUNCH_PREFIX dependencies if needed (e.g. numactl missing in container)
-    if [[ "${LAUNCH_PREFIX:-}" == *numactl* ]] && ! command -v numactl &>/dev/null; then
-        echo "[server_sglang] Installing numactl (required by LAUNCH_PREFIX)..."
-        apt-get install -y -q numactl >/dev/null 2>&1 || { echo "[server_sglang] WARNING: numactl install failed"; }
-    fi
 
     DECODE_MORI_MOE_ENV=""
     set -x

From e37fbc253728b8a6fda0bfd2d0da53382227bfee Mon Sep 17 00:00:00 2001
From: Theresa Shan <theresa.shan@amd.com>
Date: Tue, 16 Jun 2026 13:55:33 +0000
Subject: [PATCH 21/21] update dsv4 recipe

Signed-off-by: Theresa Shan <theresa.shan@amd.com>
---
 .github/configs/amd-master.yaml               | 36 +++++-----
 .../agentic/dsv4_fp4_mi355x_sglang-disagg.sh  | 67 ++++++++++---------
 benchmarks/multi_node/amd_utils/models.yaml   |  7 +-
 .../multi_node/amd_utils/server_sglang.sh     | 11 +++
 4 files changed, 67 insertions(+), 54 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index ef6b96150..7dc64c714 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2877,21 +2877,21 @@ dsv4-fp4-mi355x-sglang-disagg-agentic-hicache:
           additional-settings:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=0"
-      - spec-decoding: "none"
-        conc-list: [ 1 ]
-        offloading: hicache
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=0"
+      # - spec-decoding: "none"
+      #   conc-list: [ 1 ]
+      #   offloading: hicache
+      #   prefill:
+      #     num-worker: 1
+      #     tp: 8
+      #     ep: 1
+      #     dp-attn: false
+      #     additional-settings:
+      #     - "PREFILL_NODES=1"
+      #   decode:
+      #     num-worker: 1
+      #     tp: 8
+      #     ep: 1
+      #     dp-attn: false
+      #     additional-settings:
+      #     - "DECODE_NODES=1"
+      #     - "DECODE_MTP_SIZE=0"
diff --git a/benchmarks/multi_node/agentic/dsv4_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/agentic/dsv4_fp4_mi355x_sglang-disagg.sh
index 45c2c96b7..beba7dd46 100755
--- a/benchmarks/multi_node/agentic/dsv4_fp4_mi355x_sglang-disagg.sh
+++ b/benchmarks/multi_node/agentic/dsv4_fp4_mi355x_sglang-disagg.sh
@@ -71,39 +71,42 @@ export DISABLE_CUSTOM_ALL_REDUCE="${DISABLE_CUSTOM_ALL_REDUCE:-0}"
 # OFFLOADING=hicache | none (passed from YAML; default none for disagg).
 # HICACHE_TIER: L2 -> GPU + CPU-DRAM host pool. L3 -> + Mooncake store.
 export OFFLOADING="${OFFLOADING:-none}"
-export HICACHE_TIER="${HICACHE_TIER:-L3}"
-export HICACHE_TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-64}"
-export HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}"
-# DSV4 uses page-size 256 (set in models.yaml); HiCache must match.
-export HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-256}"
-# Per-rank L2 host pool in GB.
-export HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-100}"
-
-# ── HiCache layout/backend by tier ──
-#   L3 (Mooncake): page_first + direct + write_through     + storage=mooncake
-#   L2 (CPU DRAM): layer_first + direct + write_through_selective + storage=none
-# NOTE: write_through_selective evicts only under GPU memory pressure, avoiding
-# the mori RDMA race that causes GPU memory access faults with write_through.
-if [[ "${HICACHE_TIER^^}" == "L3" ]]; then
-  export HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first}"
-  export HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
-  export HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}"
-  export HICACHE_STORAGE_BACKEND="${HICACHE_STORAGE_BACKEND:-mooncake}"
-else
-  export HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
-  export HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
-  export HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
-  export HICACHE_STORAGE_BACKEND="${HICACHE_STORAGE_BACKEND:-}"
+# HiCache/Mooncake tunables only matter when KV offloading is enabled.
+if [[ "$OFFLOADING" == "hicache" ]]; then
+  export HICACHE_TIER="${HICACHE_TIER:-L2}"
+  export HICACHE_TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-64}"
+  export HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}"
+  # DSV4 uses page-size 256 (set in models.yaml); HiCache must match.
+  export HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-256}"
+  # Per-rank L2 host pool in GB.
+  export HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-100}"
+
+  # ── HiCache layout/backend by tier ──
+  #   L3 (Mooncake): page_first + direct + write_through     + storage=mooncake
+  #   L2 (CPU DRAM): layer_first + direct + write_through_selective + storage=none
+  # NOTE: write_through_selective evicts only under GPU memory pressure, avoiding
+  # the mori RDMA race that causes GPU memory access faults with write_through.
+  if [[ "${HICACHE_TIER^^}" == "L3" ]]; then
+    export HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first}"
+    export HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+    export HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}"
+    export HICACHE_STORAGE_BACKEND="${HICACHE_STORAGE_BACKEND:-mooncake}"
+  else
+    export HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
+    export HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+    export HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
+    export HICACHE_STORAGE_BACKEND="${HICACHE_STORAGE_BACKEND:-}"
+  fi
+  export HICACHE_DECODE="${HICACHE_DECODE:-0}"
+  # Shared nodes: use non-default Mooncake ports to avoid collisions.
+  export MC_MASTER_PORT="${MC_MASTER_PORT:-58137}"
+  export MC_METRICS_PORT="${MC_METRICS_PORT:-19003}"
+  export MC_PATCH_HOSTPOOL="${MC_PATCH_HOSTPOOL:-1}"
+  export MC_PROTOCOL="${MC_PROTOCOL:-tcp}"
+  export MC_GLOBAL_SEG="${MC_GLOBAL_SEG:-30gb}"
+  export MC_DEVICE="${MC_DEVICE:-rdma0}"
+  export MC_MASTER_ADDR="${MC_MASTER_ADDR:-}"
 fi
-export HICACHE_DECODE="${HICACHE_DECODE:-0}"
-# Shared nodes: use non-default Mooncake ports to avoid collisions.
-export MC_MASTER_PORT="${MC_MASTER_PORT:-58137}"
-export MC_METRICS_PORT="${MC_METRICS_PORT:-19003}"
-export MC_PATCH_HOSTPOOL="${MC_PATCH_HOSTPOOL:-1}"
-export MC_PROTOCOL="${MC_PROTOCOL:-tcp}"
-export MC_GLOBAL_SEG="${MC_GLOBAL_SEG:-30gb}"
-export MC_DEVICE="${MC_DEVICE:-rdma0}"
-export MC_MASTER_ADDR="${MC_MASTER_ADDR:-}"
 
 # ── MoRIIO RDMA Send Queue tuning ──
 export MORI_IO_SQ_BACKOFF_TIMEOUT_US="${MORI_IO_SQ_BACKOFF_TIMEOUT_US:-500000}"
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 28e785222..35cba0cd8 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -357,9 +357,8 @@ DeepSeek-R1-0528-MXFP4-v2:
 # thinking/reasoning-effort, dispatch dtypes, per-role PER_RANK dispatch tokens) is set
 # in env.sh's DeepSeek-V4-Pro block. The bench client uses --dsv4 framing (bench.sh).
 # prefill.disable_cuda_graph routes prefill to --disable-cuda-graph; decode keeps
-# --cuda-graph-bs. See dsv4_mi355x_sglang_disagg_plan.md.
+# --cuda-graph-bs. See dsv4_mi355x_sglang_disagg_plan.md. (Removed key, kept for reference: hf_dir: "models--deepseek-ai--DeepSeek-V4-Pro")
 DeepSeek-V4-Pro:
-  hf_dir: "models--deepseek-ai--DeepSeek-V4-Pro"
   base_flags: "--decode-log-interval 100 --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend dsv4 --page-size 256 --swa-full-tokens-ratio 0.1 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --disaggregation-transfer-backend mori"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -369,12 +368,12 @@ DeepSeek-V4-Pro:
     dp:
       max_running_requests: 1024
       chunked_prefill_size: 131072
-      context_length: 9217
+      context_length: 1000000
       max_total_tokens: 262144
     no_dp:
       max_running_requests: 128
       chunked_prefill_size: 131072
-      context_length: 9217
+      context_length: 1000000
       max_total_tokens: 262144
   decode:
     mem_fraction_static: 0.85
diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh
index 9fba74661..2f6c8145e 100755
--- a/benchmarks/multi_node/amd_utils/server_sglang.sh
+++ b/benchmarks/multi_node/amd_utils/server_sglang.sh
@@ -231,9 +231,14 @@ if [[ "$PREFILL_DISABLE_CUDA_GRAPH" == "True" ]] || [[ "$PREFILL_DISABLE_CUDA_GR
 else
     PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
 fi
+
 if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
     PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache"
 fi
+# Agentic runs (IS_AGENTIC=1 or true): keep the radix/prefix cache enabled by stripping any --disable-radix-cache flag from PREFILL_MODE_FLAGS.
+if [[ "${IS_AGENTIC:-0}" == "1" || "${IS_AGENTIC:-}" == "true" ]]; then
+    PREFILL_MODE_FLAGS="${PREFILL_MODE_FLAGS//--disable-radix-cache/}"
+fi
 if [[ -n "$prefill_context_length" ]]; then
     PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --context-length ${prefill_context_length}"
 fi
@@ -367,6 +372,12 @@ build_server_config() {
 PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE")
 DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE")
 
+# Expose Prometheus /metrics on the servers when requested (ENABLE_METRICS=1).
+if [[ "${ENABLE_METRICS:-0}" == "1" ]]; then
+    [[ "$PREFILL_SERVER_CONFIG" != *"--enable-metrics"* ]] && PREFILL_SERVER_CONFIG="$PREFILL_SERVER_CONFIG --enable-metrics"
+    [[ "$DECODE_SERVER_CONFIG" != *"--enable-metrics"* ]] && DECODE_SERVER_CONFIG="$DECODE_SERVER_CONFIG --enable-metrics"
+fi
+
 if [[ -n "$MODEL_NAME" ]]; then
     echo "Using model-specific configuration for: $MODEL_NAME"
 fi