From 01cc2af8be436575d31120587edc63e957a333a9 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Thu, 11 Jun 2026 15:05:02 +0900 Subject: [PATCH 01/21] [AMD] agentic: add hicache/lmcache configs, update agentic scripts for mi355x models Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 113 ++- .../single_node/agentic/glm5.1_fp4_mi355x.sh | 96 ++- .../agentic/kimik2.5_fp4_mi355x.sh | 661 ++---------------- .../agentic/minimaxm2.5_fp4_mi355x.sh | 272 +++++++ .../agentic/minimaxm2.5_fp8_mi355x.sh | 256 ++++++- .../single_node/agentic/qwen3.5_fp4_mi355x.sh | 150 ++++ .../single_node/agentic/qwen3.5_fp8_mi355x.sh | 101 ++- runners/launch_mi355x-amds.sh | 2 +- 8 files changed, 980 insertions(+), 671 deletions(-) create mode 100755 benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh create mode 100755 benchmarks/single_node/agentic/qwen3.5_fp4_mi355x.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a50d37eab..ee8718506 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -433,6 +433,22 @@ qwen3.5-fp4-mi355x-sglang: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 16 } +# target +qwen3.5-fp4-mi355x-sglang-agentic-hicache: + image: lmsysorg/sglang:v0.5.12-rocm720-mi35x + model: amd/Qwen3.5-397B-A17B-MXFP4 + model-prefix: qwen3.5 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 2, ep: 1, offloading: none, conc-list: [8, 16, 32, 40, 48, 56, 72] } + - { tp: 2, ep: 1, offloading: hicache, conc-list: [8, 16, 32, 40, 48, 56, 72] } + qwen3.5-fp4-mi355x-atom: image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: amd/Qwen3.5-397B-A17B-MXFP4 @@ -872,6 +888,22 @@ minimaxm2.5-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 16 } +# target +minimaxm2.5-fp4-mi355x-vllm-agentic-lmcache: 
+ image: vllm/vllm-openai-rocm:v0.22.0 + model: amd/MiniMax-M2.5-MXFP4 + model-prefix: minimaxm2.5 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 1, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } + - { tp: 1, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } + minimaxm2.5-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.22.0 model: amd/MiniMax-M2.5-MXFP4 @@ -2494,6 +2526,23 @@ glm5.1-fp4-mi355x-sglang-agentic: # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } +# target +glm5.1-fp4-mi355x-sglang-agentic-hicache: + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529 + model: amd/GLM-5.1-MXFP4 + model-prefix: glm5.1 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively + - { tp: 2, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48] } + - { tp: 2, ep: 1, offloading: hicache, conc-list: [4, 8, 16, 32, 40, 48] } + kimik2.5-fp4-mi355x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 model: amd/Kimi-K2.5-MXFP4 @@ -2518,8 +2567,40 @@ kimik2.5-fp4-mi355x-vllm-agentic: - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } +# target +kimik2.5-fp4-mi355x-vllm-agentic-lmcache: + image: vllm/vllm-openai-rocm:v0.22.0 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } + - { tp: 4, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 
40, 48, 56, 64] } + +# target +kimik2.5-fp4-mi355x-vllm-agentic-lmcache-060226DRAM1500GB: + image: vllm/vllm-openai-rocm:v0.22.0 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + #- { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } + - { tp: 4, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } + minimaxm2.5-fp8-mi355x-vllm-agentic: - image: vllm/vllm-openai-rocm:v0.22.1 + image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x @@ -2536,8 +2617,27 @@ minimaxm2.5-fp8-mi355x-vllm-agentic: - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] } - { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] } +# target +minimaxm2.5-fp8-mi355x-vllm-agentic-lmcache: + image: vllm/vllm-openai-rocm:v0.22.0 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi355x + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical). + # Compute saturates first; cpu offload likely won't help, but worth confirming. + # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). 
+ - duration: 1800 + search-space: + - { tp: 2, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } + - { tp: 2, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } + minimaxm2.5-fp8-mi300x-vllm-agentic: - image: vllm/vllm-openai-rocm:v0.22.1 + image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi300x @@ -2555,7 +2655,7 @@ minimaxm2.5-fp8-mi300x-vllm-agentic: - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } minimaxm2.5-fp8-mi325x-vllm-agentic: - image: vllm/vllm-openai-rocm:v0.22.1 + image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi325x @@ -2573,8 +2673,9 @@ minimaxm2.5-fp8-mi325x-vllm-agentic: - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } +# target qwen3.5-fp8-mi355x-sglang-agentic-hicache: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521 + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260531 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x @@ -2585,8 +2686,8 @@ qwen3.5-fp8-mi355x-sglang-agentic-hicache: agentic-coding: - duration: 1800 search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } + - { tp: 4, ep: 1, offloading: none, conc-list: [8, 16, 32, 40, 48, 56, 72] } + - { tp: 4, ep: 1, offloading: hicache, conc-list: [8, 16, 32, 40, 48, 56, 72] } dsv4-fp4-mi355x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh index 3b85a31cd..6bea8dddd 100755 --- a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh @@ -2,18 +2,29 @@ set -euo pipefail set -x -# Agentic trace replay benchmark for GLM-5.1 FP4 on 
MI355X using SGLang. +# Agentic trace replay benchmark for GLM-5.1 FP4 on MI355X using SGLang. +# +# Server recipe is GLM-5.1 specific (see the launch command below): +# tilelang NSA prefill/decode backends, fp8_e4m3 KV cache, +# mem-fraction-static 0.85. +# The agentic harness (resolve_trace_source / build_replay_cmd / +# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and +# --disable-radix-cache is dropped because agentic replay needs prefix reuse. # # Required env vars: -# MODEL, TP, CONC, RESULT_DIR +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION (optional: PORT, DURATION) +# +# OFFLOADING values: +# none - SGLang GPU KV with the default RadixAttention prefix cache. +# hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix. source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC RESULT_DIR DURATION +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +EP_SIZE=${EP_SIZE:-1} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -30,8 +41,16 @@ else hf download "$MODEL" export MODEL_PATH="$MODEL" fi + rocm-smi || true amd-smi || true +# ---- Resolve traces and install deps ---------------------------------------- +# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the +# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf +# signal at high concurrency. 
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +#060226 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source @@ -48,26 +67,85 @@ mkdir -p "$RESULT_DIR" pip install -U transformers +CACHE_ARGS=() +WARMUP_ARGS=() +CUDA_GRAPH_MAX_BS="$CONC" +case "$OFFLOADING" in + none) + # Leave SGLang's default RadixAttention prefix cache on — agentic + # replay needs it; --disable-radix-cache would zero the hit rate. + ;; + hicache) + # GLM-5.1 FP4 uses a standard transformer (no hybrid Mamba path), + # so one HiCache host pool per TP rank is sufficient. + # The node-total DRAM budget divides by TP and host-pool count. + TOTAL_CPU_DRAM_GB=3000 + HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}" + HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-500}}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" + # GLM-5.1 uses standard paged attention (no no_buffer scheduler constraint), + # so page_size can be left at the default. Keep the safer direct/layer_first + # copy path on ROCm. 
+ HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" + HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}" + if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then + HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" + fi + if [ "$HICACHE_SIZE_GB" -lt 1 ]; then + echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2 + exit 1 + fi + echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}" + CACHE_ARGS=( + --page-size "$HICACHE_PAGE_SIZE" + --enable-hierarchical-cache + --hicache-size "$HICACHE_SIZE_GB" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + ) + # HiCache startup reaches API readiness but SGLang's internal warmup + # request can time out on this path; let aiperf own benchmark traffic. + WARMUP_ARGS=(--skip-server-warmup) + # Don't force ROCm graph capture at every high concurrency point; conc=16 + # is the highest known-good capture size for this model/server path. + HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}" + if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then + CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS" + fi + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + echo "Starting SGLang server..." 
export PYTHONNOUSERSITE=1 +pip install -U transformers python3 -m sglang.launch_server \ - --model-path "$MODEL_PATH" --served-model-name "$MODEL" \ + --model-path "$MODEL_PATH" \ + --served-model-name "$MODEL" \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ --cuda-graph-max-bs $CONC \ --max-running-requests $CONC \ - --context-length $MAX_MODEL_LEN \ --mem-fraction-static 0.85 \ --tool-call-parser glm47 \ --reasoning-parser glm45 \ --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ --nsa-prefill-backend tilelang \ --nsa-decode-backend tilelang \ + --watchdog-timeout 1200 \ --kv-cache-dtype fp8_e4m3 \ --tokenizer-worker-num $((TP*2)) \ + "${CACHE_ARGS[@]}" \ + "${WARMUP_ARGS[@]}" \ --enable-metrics > "$SERVER_LOG" 2>&1 & SERVER_PID=$! echo "Server PID: $SERVER_PID" @@ -77,4 +155,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -run_agentic_replay_and_write_outputs "$RESULT_DIR" +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh index 139b12256..b3211ff49 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh @@ -14,15 +14,11 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE - -# Kimi-K2.5 advertises a 262144-token context window in vLLM 0.21.0. -# Matrix defaults may export MAX_MODEL_LEN=0 to mean "server default"; for this -# script we need the concrete value so AgentX filters prompt+max_tokens against -# the same limit vLLM enforces. 
-if [[ -z "${MAX_MODEL_LEN:-}" || "$MAX_MODEL_LEN" == "0" ]]; then - MAX_MODEL_LEN=262144 -fi +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +EP_SIZE=${EP_SIZE:-1} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -44,546 +40,22 @@ else hf download "$MODEL" export MODEL_PATH="$MODEL" fi + rocm-smi || true amd-smi || true +# ---- Resolve traces and install deps ---------------------------------------- +# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the +# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf +# signal at high concurrency. +#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +#060226 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k + # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source install_agentic_deps -# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) -pip install amd-quark - -# Disable AITER RMSNorm for TP < 8 due to accuracy issues -if [ "${TP}" -lt 8 ]; then - export VLLM_ROCM_USE_AITER_RMSNORM=0 -fi - -write_lmcache_rocm_mp_patch() { - local patch_dir="$1" - mkdir -p "$patch_dir" - cat > "$patch_dir/sitecustomize.py" <<'PY' -"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches.""" - -import os -import threading - -if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1": - import builtins - import sys - - _orig_import = builtins.__import__ - - def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None: - _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator - - if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False): - return - - _orig_init = _LazyMemoryAllocator.__init__ - _orig_allocate = _LazyMemoryAllocator.allocate - _orig_batched_allocate = 
_LazyMemoryAllocator.batched_allocate - - def _expand_to(self, target_size: int) -> None: - target_size = min( - self._final_size, - _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE), - ) - lock = self._agentic_rocm_demand_expand_lock - with lock: - if target_size <= self._curr_size: - return - - start_size = self._curr_size - while self._curr_size < target_size: - commit_start = self._curr_size - commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE) - while self._curr_size < commit_target: - self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE) - self._curr_size += self.PIN_CHUNK_SIZE - self._commit_expansion(self._curr_size - commit_start) - - self._log_expansion_progress(self._curr_size - start_size) - - def _retry_with_demand_expansion(self, allocate_once): - obj = allocate_once() - step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64")) - step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3))) - - while obj is None and self._curr_size < self._final_size: - _expand_to(self, self._curr_size + step_bytes) - obj = allocate_once() - - return obj - - def _patched_init(self, *args, **kwargs): - _orig_init(self, *args, **kwargs) - self._agentic_rocm_demand_expand_lock = threading.Lock() - - # LMCache MP's upstream LazyMemoryAllocator currently expands to - # the final pinned size in a background thread. On ROCm Kimi TP4, - # vLLM reaches KV-cache registration only after that 2.5 TB pool - # is fully pinned, and the server-side IPC open path can stall - # before acknowledging register_kv_caches. Keep the same final - # capacity, but pin/commit extra host memory only when L1 - # allocations actually need it. 
- self._stop_expand.set() - self._expand_thread.join() - _lazy_memory_allocator.logger.info( - "Agentic ROCm patch: using demand-driven LMCache pinned " - "memory expansion; final capacity remains %s MB", - self._final_size >> 20, - ) - - def _patched_allocate( - self, - shapes, - dtypes, - fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, - allocator_type=None, - ): - return _retry_with_demand_expansion( - self, - lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type), - ) - - def _patched_batched_allocate( - self, - shapes, - dtypes, - batch_size, - fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, - allocator_type=None, - ): - return _retry_with_demand_expansion( - self, - lambda: _orig_batched_allocate( - self, shapes, dtypes, batch_size, fmt, allocator_type - ), - ) - - _LazyMemoryAllocator.__init__ = _patched_init - _LazyMemoryAllocator.allocate = _patched_allocate - _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate - _LazyMemoryAllocator._agentic_rocm_demand_patch = True - - def _patch_l1_memory_manager(_memory_manager) -> None: - _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None) - _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None) - if _L1MemoryManager is None or _LazyMemoryAllocator is None: - return - if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False): - return - - _orig_get_memory_usage = _L1MemoryManager.get_memory_usage - - def _patched_get_memory_usage(self): - allocator = getattr(self, "_allocator", None) - if isinstance(allocator, _LazyMemoryAllocator): - address_manager = allocator.get_address_manager() - used_size = ( - address_manager.get_heap_size() - address_manager.get_free_size() - ) - return used_size, allocator._final_size - return _orig_get_memory_usage(self) - - _L1MemoryManager.get_memory_usage = _patched_get_memory_usage - _L1MemoryManager._agentic_rocm_final_capacity_patch = True - - def _maybe_patch_lazy_memory_allocator() -> None: - 
module = sys.modules.get("lmcache.v1.lazy_memory_allocator") - if module is not None and hasattr(module, "LazyMemoryAllocator"): - _patch_lazy_memory_allocator(module) - - def _maybe_patch_l1_memory_manager() -> None: - module = sys.modules.get("lmcache.v1.distributed.memory_manager") - if module is not None and hasattr(module, "L1MemoryManager"): - _patch_l1_memory_manager(module) - - def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0): - module = _orig_import(name, globals, locals, fromlist, level) - if name == "lmcache.v1.lazy_memory_allocator" or ( - name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules - ): - _maybe_patch_lazy_memory_allocator() - if name == "lmcache.v1.distributed.memory_manager" or ( - name.startswith("lmcache") - and "lmcache.v1.distributed.memory_manager" in sys.modules - ): - _maybe_patch_l1_memory_manager() - return module - - builtins.__import__ = _agentic_rocm_import - _maybe_patch_lazy_memory_allocator() - _maybe_patch_l1_memory_manager() - -if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1": - import torch - import lmcache.non_cuda_equivalents as lmc - - if not hasattr(lmc, "multi_layer_block_kv_transfer"): - _DTYPE_BY_NAME = { - "bfloat16": torch.bfloat16, - "float16": torch.float16, - "float32": torch.float32, - } - - def _dtype_from_env() -> torch.dtype: - name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16") - try: - return _DTYPE_BY_NAME[name] - except KeyError as exc: - raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc - - def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor: - block_stride = shape_desc.block_stride_elems or ( - shape_desc.bs * shape_desc.nh * shape_desc.hs - ) - base = lmc._tensor_from_ptr( - ptr, - (shape_desc.nb * block_stride,), - dtype, - device, - ) - return torch.as_strided( - base, - (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs), - 
(block_stride, shape_desc.nh * shape_desc.hs, 1), - ) - - def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor: - return lmc._tensor_from_ptr( - ptr, - (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs), - dtype, - device, - ) - - def multi_layer_block_kv_transfer( - group_kv_pointers, - tmp_buffer_ptrs, - block_ids, - paged_memory_device, - direction, - shape_desc, - lmcache_chunk_size, - gpu_kv_format, - skip_blocks=0, - ) -> None: - # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with - # shape [num_blocks, block_size, hidden_size]. LMCache's Python - # fallback has no block-transfer entrypoint yet, so implement the - # same gather/scatter contract with torch indexing on ROCm. - if shape_desc.kv_size != 1: - raise NotImplementedError( - "ROCm LMCache MP block fallback currently supports MLA KV caches only" - ) - - dtype = _dtype_from_env() - device = ( - paged_memory_device - if isinstance(paged_memory_device, torch.device) - else torch.device(paged_memory_device) - ) - num_layers = int(group_kv_pointers.numel()) - blocks_per_chunk = lmcache_chunk_size // shape_desc.bs - direction_name = getattr(direction, "name", str(direction)) - - for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs): - start = chunk_idx * blocks_per_chunk - end = start + blocks_per_chunk - chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long) - - dest_slot_offset = 0 - if skip_blocks and chunk_idx == 0: - chunk_blocks = chunk_blocks[int(skip_blocks):] - dest_slot_offset = int(skip_blocks) * shape_desc.bs - if chunk_blocks.numel() == 0: - continue - - num_slots = int(chunk_blocks.numel()) * shape_desc.bs - tmp = _tmp_view( - int(tmp_ptr), - shape_desc, - num_layers, - lmcache_chunk_size, - dtype, - device, - ) - - for layer_idx in range(num_layers): - paged = _paged_view( - int(group_kv_pointers[layer_idx].item()), - shape_desc, - dtype, - device, - ) - tmp_slice = tmp[ - 
0, - layer_idx, - dest_slot_offset : dest_slot_offset + num_slots, - :, - ] - if direction_name == "D2H": - gathered = paged.index_select(0, chunk_blocks).reshape( - num_slots, shape_desc.nh * shape_desc.hs - ) - tmp_slice.copy_(gathered) - elif direction_name == "H2D": - src = tmp_slice.reshape( - int(chunk_blocks.numel()), - shape_desc.bs, - shape_desc.nh * shape_desc.hs, - ) - paged.index_copy_(0, chunk_blocks, src) - else: - raise ValueError(f"Unsupported transfer direction: {direction}") - - lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer - -# ---- Chunked KV loading (prevents GPU block exhaustion at high concurrency) ---- -if os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "0") != "0": - import chunked_connector_patch # noqa: F401 - -# ---- vLLM scheduler assertion fix (stale KV transfer notifications) ---- -import scheduler_assertion_patch # noqa: F401 -PY -} - -write_chunked_connector_patch() { - local patch_dir="$1" - mkdir -p "$patch_dir" - cat > "$patch_dir/chunked_connector_patch.py" <<'PY' -""" -Monkey-patch for LMCacheMPConnector to add chunked KV loading. - -Fixes GPU block exhaustion deadlock at high concurrency by capping -the number of external tokens reported AND retrieved per scheduling step. - -Usage: set CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD= and import this -module from sitecustomize.py before LMCache is loaded. 
-""" - -import logging -import os -import sys -import builtins - -logger = logging.getLogger("chunked_lmcache_patch") - -_MAX_TOKENS = int(os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "32768")) - -# Per-request chunk tracking (module-level, survives across calls) -_chunk_state: dict[str, dict] = {} - - -def _apply_patch(): - """Patch LMCacheMPConnector in-place.""" - mod = sys.modules.get("lmcache.integration.vllm.lmcache_mp_connector") - if mod is None: - return - cls = getattr(mod, "LMCacheMPConnector", None) - if cls is None or getattr(cls, "_chunked_patch_applied", False): - return - - LMCacheMPRequestState = getattr(mod, "LMCacheMPRequestState", None) - _orig_get_matched = cls.get_num_new_matched_tokens - _orig_get_finished = cls.get_finished - - def _get_blocks_per_chunk(self): - block_size = getattr(self, "block_size", 1) - return max(1, _MAX_TOKENS // block_size) - - def _patched_get_num_new_matched_tokens(self, request, num_computed_tokens): - full_match = _orig_get_matched(self, request, num_computed_tokens) - if full_match <= 0 or _MAX_TOKENS <= 0: - return full_match - - req_id = request.request_id - block_size = getattr(self, "block_size", 1) - blocks_per_chunk = _get_blocks_per_chunk(self) - full_match_blocks = full_match // block_size - - state = _chunk_state.get(req_id) - if state is None or state.get("num_computed_at_start") != num_computed_tokens: - state = { - "full_match_blocks": full_match_blocks, - "chunk_end_blocks": 0, - "num_computed_at_start": num_computed_tokens, - "lookup_done": False, - } - _chunk_state[req_id] = state - - if state["lookup_done"]: - return 0 - - remaining = state["full_match_blocks"] - state["chunk_end_blocks"] - if remaining <= 0: - state["lookup_done"] = True - return 0 - - this_chunk = min(remaining, blocks_per_chunk) - state["chunk_end_blocks"] += this_chunk - if state["chunk_end_blocks"] >= state["full_match_blocks"]: - state["lookup_done"] = True - - capped = this_chunk * block_size - if capped < 
full_match: - logger.debug( - "Chunked LMCache: req %s capped %d -> %d tokens " - "(chunk %d/%d blocks)", - req_id, full_match, capped, this_chunk, full_match_blocks, - ) - - # Cap the tracker's hit blocks to match what we report - tracker = getattr(request, "kv_transfer_params", None) - if tracker is not None: - orig_hits = getattr(tracker, "num_lmcache_hit_blocks", 0) - if orig_hits > this_chunk: - tracker.num_lmcache_hit_blocks = this_chunk - - return capped - - def _patched_get_finished(self, scheduler_output): - result = _orig_get_finished(self, scheduler_output) - # Clean up chunk state for finished requests. - # vLLM passes scheduler_output as a set of request-ID strings - # (not a SchedulerOutput object), so iterate directly when it - # is a set/frozenset; fall back to the attribute path for - # forward compatibility. - if isinstance(scheduler_output, (set, frozenset)): - finished = scheduler_output - else: - finished = getattr(scheduler_output, "finished_req_ids", []) - for req in finished: - _chunk_state.pop(req, None) - return result - - cls.get_num_new_matched_tokens = _patched_get_num_new_matched_tokens - cls.get_finished = _patched_get_finished - cls._chunked_patch_applied = True - logger.info( - "Chunked LMCache connector patch applied " - "(max_tokens_per_load=%d)", _MAX_TOKENS, - ) - - -_orig_import = builtins.__import__ - - -def _patching_import(name, *args, **kwargs): - module = _orig_import(name, *args, **kwargs) - if ( - name == "lmcache.integration.vllm.lmcache_mp_connector" - or ( - name.startswith("lmcache") - and "lmcache.integration.vllm.lmcache_mp_connector" in sys.modules - ) - ): - _apply_patch() - return module - - -builtins.__import__ = _patching_import -_apply_patch() -PY -} - -write_scheduler_assertion_patch() { - local patch_dir="$1" - mkdir -p "$patch_dir" - cat > "$patch_dir/scheduler_assertion_patch.py" <<'PY' -""" -Patch vLLM scheduler to handle stale finished_recving gracefully. 
- -The assertion at scheduler.py crashes when a KV transfer reports -"finished recving" but the request is already in RUNNING state. -This happens when transfers complete asynchronously and the scheduler -has already moved the request forward. - -Fix: Instead of asserting, log a warning and skip. -""" - -import logging -import sys -import builtins - -logger = logging.getLogger("scheduler_assertion_patch") - - -def _apply_patch(): - """Patch vLLM scheduler's _update_from_kv_xfer_finished.""" - sched_mod = sys.modules.get("vllm.v1.core.sched.scheduler") - if sched_mod is None: - return - req_mod = sys.modules.get("vllm.v1.request") - if req_mod is None: - return - Scheduler = getattr(sched_mod, "Scheduler", None) - RequestStatus = getattr(req_mod, "RequestStatus", None) - if Scheduler is None or RequestStatus is None: - return - if getattr(Scheduler, "_kv_xfer_patch_applied", False): - return - - _orig_update = Scheduler._update_from_kv_xfer_finished - - def _patched_update(self, kv_connector_output): - if self.connector is not None: - self.connector.update_connector_output(kv_connector_output) - for req_id in kv_connector_output.finished_recving or (): - if req_id not in self.requests: - continue - req = self.requests[req_id] - if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: - self.finished_recving_kv_req_ids.add(req_id) - elif RequestStatus.is_finished(req.status): - self._free_blocks(self.requests[req_id]) - else: - logger.warning( - "Stale finished_recving for req %s in status %s; skipping.", - req_id, req.status.name, - ) - for req_id in kv_connector_output.finished_sending or (): - if req_id not in self.requests: - continue - self._free_blocks(self.requests[req_id]) - - Scheduler._update_from_kv_xfer_finished = _patched_update - Scheduler._kv_xfer_patch_applied = True - logger.info("Scheduler KV transfer assertion patch applied") - - -_orig_import = builtins.__import__ - - -def _patching_import(name, *args, **kwargs): - module = _orig_import(name, *args, 
**kwargs) - if ( - name == "vllm.v1.core.sched.scheduler" - or ( - name.startswith("vllm") - and "vllm.v1.core.sched.scheduler" in sys.modules - ) - ): - _apply_patch() - return module - - -builtins.__import__ = _patching_import -_apply_patch() -PY -} - -# Workaround for MEC FW <177 RCCL memory reclaim issue -version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') -if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" @@ -591,6 +63,8 @@ mkdir -p "$RESULT_DIR" OFFLOAD_ARGS=() PREFIX_CACHE_ARGS=() + +# ---- LMCache config ---------------------------------------------------------- LMCACHE_PID="" cleanup_lmcache_server() { @@ -648,7 +122,9 @@ case "$OFFLOADING" in # MI355X nodes have ~2.7 TiB of host DRAM available for offload; # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for # worker RSS / page cache / slurm cgroup). - TOTAL_CPU_DRAM_GB=2500 + # TODO: the hard-coded 3000 GB node total overrides the required TOTAL_CPU_DRAM_GB env var and no longer matches the 2.5 TB pool described above; reconcile the two. + TOTAL_CPU_DRAM_GB=3000 + TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" # Use vLLM's regular native KV-offload path (OffloadingConnector), # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1 @@ -659,7 +135,7 @@ case "$OFFLOADING" in # (vllm/config/vllm.py:662). 
OFFLOAD_ARGS=( --kv_offloading_backend native - --kv_offloading_size "$TOTAL_CPU_DRAM_GB" + --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB" --disable-hybrid-kv-cache-manager ) ;; @@ -667,74 +143,20 @@ case "$OFFLOADING" in { set +x; } 2>/dev/null unset VLLM_USE_SIMPLE_KV_OFFLOAD - agentic_pip_install --quiet --no-cache-dir lmcache - # LMCache's current dependency chain can install NVIDIA/CUDA NIXL and - # CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and - # during Kimi fused-MoE model inspection it imports nixl_ep whenever - # that module is importable, even when this run is not using EP/NIXL - # kernels. The CUDA extension then fails immediately on AMD nodes with - # "ImportError: libcuda.so.1". - # - # LMCache MP also uses CuPy stream APIs while registering vLLM's KV - # caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime - # with cudaErrorInsufficientDriver when LMCache touches the stream. Use - # the ROCm 7 CuPy wheel so the same API dispatches through HIP. - python3 -m pip uninstall -y \ - nixl nixl-cu12 nixl-cu13 nixl_ep \ - >/dev/null 2>&1 || true - python3 -m pip uninstall -y \ - cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \ - >/dev/null 2>&1 || true - agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0 - python3 - <<'PY' -import importlib.util -import sys - -spec = importlib.util.find_spec("nixl_ep") -if spec is not None: - locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"]) - print( - "Error: nixl_ep is still importable after LMCache install; " - "this ROCm Kimi run would import a CUDA-only nixl_ep module. 
" - f"location={locations}", - file=sys.stderr, - ) - sys.exit(1) - -try: - from cupy_backends.cuda.api import runtime as cupy_runtime -except Exception as exc: - print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr) - sys.exit(1) - -if not getattr(cupy_runtime, "is_hip", False): - print( - "Error: CuPy is still using the CUDA backend after installing " - "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.", - file=sys.stderr, - ) - sys.exit(1) -PY - LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch" - write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR" - write_chunked_connector_patch "$LMCACHE_ROCM_PATCH_DIR" - write_scheduler_assertion_patch "$LMCACHE_ROCM_PATCH_DIR" - export LMCACHE_ROCM_MP_BLOCK_FALLBACK=1 - export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16 - export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=1 - # Cap external KV tokens loaded per scheduling step to prevent GPU - # block exhaustion deadlock at high concurrency (c>=32). Default - # 32768 keeps peak block demand within the GPU KV pool. Set to 0 to - # disable chunking (only safe at low concurrency). - export CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD="${CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD:-32768}" - export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}" + git clone https://github.com/LMCache/LMCache.git + cd LMCache + pip install -r requirements/build.txt + CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation + cd .. + python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV # pool, but let the external MP server own that pool so vLLM does not # split --kv-offloading-size across TP ranks through the integrated # LMCache backend. 
- TOTAL_CPU_DRAM_GB=2500 + #TODO: fix + TOTAL_CPU_DRAM_GB=3000 LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" LMCACHE_PORT="${LMCACHE_PORT:-5555}" LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" @@ -742,7 +164,7 @@ PY # ZMQ endpoint. Bind the server to a raw host, but pass the connector a # ZMQ-style host string. LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" - LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}" + LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" # LMCache read locks are leases on chunks that lookup has promised # vLLM can retrieve. The default 300s TTL is too short for this @@ -750,10 +172,11 @@ PY # lookup and retrieve while GPU KV is saturated, which leaves the # object present in L1 but no longer readable. Keep the 2.5 TB pool # size unchanged and only extend the lookup-to-retrieve lease. - LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-3600}" + LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}" LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" + export LMCACHE_BLOCKING_TIMEOUT_SECS=120 echo "Starting LMCache MP server..." LMCACHE_CMD=( @@ -786,6 +209,7 @@ PY *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; esac +# ---- LLM server config ---------------------------------------------------------- EP_ARGS=() if [ "$EP_SIZE" -gt 1 ]; then EP_ARGS=(--enable-expert-parallel) @@ -794,6 +218,23 @@ fi echo "Starting vllm server..." 
export PYTHONNOUSERSITE=1 +# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) +pip install amd-quark + +# Disable AITER RMSNorm for TP < 8 due to accuracy issues +if [ "${TP}" -lt 8 ]; then + export VLLM_ROCM_USE_AITER_RMSNORM=0 +fi + +# Workaround for MEC FW <177 RCCL memory reclaim issue +version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') +if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + { set +x; } 2>/dev/null VLLM_CMD=( vllm serve "$MODEL_PATH" --served-model-name "$MODEL" @@ -802,9 +243,9 @@ VLLM_CMD=( --tensor-parallel-size="$TP" "${EP_ARGS[@]}" --gpu-memory-utilization 0.90 + --kv-cache-dtype fp8 \ --block-size=1 --trust-remote-code - --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$CONC" --mm-encoder-tp-mode data "${PREFIX_CACHE_ARGS[@]}" @@ -821,4 +262,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -run_agentic_replay_and_write_outputs "$RESULT_DIR" +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh new file mode 100755 index 000000000..640fe7f65 --- /dev/null +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh @@ -0,0 +1,272 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for MiniMax-M2.5 FP4 on MI355X using vLLM. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR +# +# OFFLOADING values: +# none - vLLM GPU KV only. +# cpu - vLLM native CPU offload. +# lmcache - LMCache MP server + vLLM LMCacheMPConnector.
+ +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +EP_SIZE=${EP_SIZE:-1} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ROCR/HIP visibility for vLLM 0.14+ +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + +rocm-smi || true +amd-smi || true + +# ---- Resolve traces and install deps ---------------------------------------- +# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 +# corpus has requests up to ~1M proxy tokens that would be rejected. +# Switch to the 256k-capped variant (470 traces, max in+out <= 256k). 
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +#060226 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k + +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" +mkdir -p "$RESULT_DIR" + +OFFLOAD_ARGS=() +PREFIX_CACHE_ARGS=() + +# ---- Lmcache config ---------------------------------------------------------- +LMCACHE_PID="" + +cleanup_lmcache_server() { + if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then + kill "$LMCACHE_PID" 2>/dev/null || true + wait "$LMCACHE_PID" 2>/dev/null || true + fi +} + +trap cleanup_lmcache_server EXIT + +wait_for_lmcache_ready() { + { set +x; } 2>/dev/null + local attempts="${LMCACHE_READY_ATTEMPTS:-120}" + local tail_pid="" + + while [ ! -f "$LMCACHE_LOG" ]; do + if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then + echo "LMCache server died before creating log file. Exiting." >&2 + exit 1 + fi + sleep 1 + done + + tail -f -n +1 "$LMCACHE_LOG" & + tail_pid=$! + + for ((i = 1; i <= attempts; i++)); do + if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + return 0 + fi + if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then + echo "LMCache server died before becoming healthy. Log follows:" >&2 + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + cat "$LMCACHE_LOG" >&2 || true + exit 1 + fi + sleep 1 + done + + echo "Timed out waiting for LMCache server healthcheck. 
Log follows:" >&2 + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + cat "$LMCACHE_LOG" >&2 || true + exit 1 +} + +case "$OFFLOADING" in + none) ;; + cpu) + unset VLLM_USE_SIMPLE_KV_OFFLOAD + # MI355X nodes have ~2.7 TiB of host DRAM available for offload; + # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for + # worker RSS / page cache / slurm cgroup). + TOTAL_CPU_DRAM_GB=3000 + TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + # Use vLLM's regular native KV-offload path (OffloadingConnector), + # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to + # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1 + # would switch it to SimpleCPUOffloadConnector. We intentionally leave + # that env var UNSET here so the regular OffloadingConnector path is + # used. The shortcut --kv_offloading_backend native + --kv_offloading_size + # form constructs the KVTransferConfig at engine startup + # (vllm/config/vllm.py:662). + + # Remove --disable-hybrid-kv-cache-manager and enable hybrid kv cache manager (default) + # This gives extra cache hit than disabling hybrid kv cache manager + # srok, + # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma + # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60 + OFFLOAD_ARGS=( + --kv_offloading_backend native + --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB" + --disable-hybrid-kv-cache-manager + ) + ;; + lmcache) + { set +x; } 2>/dev/null + unset VLLM_USE_SIMPLE_KV_OFFLOAD + + git clone https://github.com/LMCache/LMCache.git + cd LMCache + pip install -r requirements/build.txt + CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation + cd .. 
+ + python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null + + # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV + # pool, but let the external MP server own that pool so vLLM does not + # split --kv-offloading-size across TP ranks through the integrated + # LMCache backend. + TOTAL_CPU_DRAM_GB=3000 + LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" + LMCACHE_PORT="${LMCACHE_PORT:-5555}" + LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" + # LMCacheMPConnector concatenates lmcache.mp.host and port into the + # ZMQ endpoint. Bind the server to a raw host, but pass the connector a + # ZMQ-style host string. + LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" + #LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + # (srok)TODO: intentionally increased DRAM size + TOTAL_CPU_DRAM_GB=2000 + LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB))}" + LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" + # LMCache read locks are leases on chunks that lookup has promised + # vLLM can retrieve. The default 300s TTL is too short for this + # long-context agentic queue: TP8/conc32 can spend >300s between + # lookup and retrieve while GPU KV is saturated, which leaves the + # object present in L1 but no longer readable. Keep the 2.5 TB pool + # size unchanged and only extend the lookup-to-retrieve lease. + LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}" + # (srok) check 256 vs 32 + #LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" + LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-32}" + LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" + export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" + export LMCACHE_BLOCKING_TIMEOUT_SECS=120 + + set -x + echo "Starting LMCache MP server..." 
+ LMCACHE_CMD=( + lmcache server + --host "$LMCACHE_HOST" + --port "$LMCACHE_PORT" + --http-host "$LMCACHE_HOST" + --http-port "$LMCACHE_HTTP_PORT" + --l1-size-gb "$LMCACHE_L1_SIZE_GB" + --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB" + --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS" + --chunk-size "$LMCACHE_CHUNK_SIZE" + --max-workers "$LMCACHE_MAX_WORKERS" + --eviction-policy LRU + ) + printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt" + printf '\n' >> "$RESULT_DIR/lmcache_command.txt" + "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 & + LMCACHE_PID=$! + echo "LMCache server PID: $LMCACHE_PID" + wait_for_lmcache_ready + + PREFIX_CACHE_ARGS=(--enable-prefix-caching) + # srok, + # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma + # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60 + OFFLOAD_ARGS=( + --kv-transfer-config + "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}" + --disable-hybrid-kv-cache-manager + ) + ;; + *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; +esac + +# ---- LLM server config ---------------------------------------------------------- +EP_ARGS=() +if [ "$EP_SIZE" -gt 1 ]; then + EP_ARGS=(--enable-expert-parallel) +fi + +echo "Starting vllm server..." 
+export PYTHONNOUSERSITE=1 + +# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) +pip install -q amd-quark + +# Workaround for MEC FW <177 RCCL memory reclaim issue +version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') +if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +{ set +x; } 2>/dev/null +VLLM_CMD=( + vllm serve "$MODEL_PATH" --served-model-name "$MODEL" + --host 0.0.0.0 + --port "$PORT" + --tensor-parallel-size="$TP" + "${EP_ARGS[@]}" + --gpu-memory-utilization 0.95 + --kv-cache-dtype fp8 \ + --block-size=32 + --trust-remote-code + --attention-backend "ROCM_AITER_FA" + --max-num-seqs "$CONC" + "${PREFIX_CACHE_ARGS[@]}" + "${OFFLOAD_ARGS[@]}" +) +printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt" +printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" +"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh index 8e15e7850..9f1f79a3f 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh @@ -2,18 +2,23 @@ set -euo pipefail set -x -# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI355X using vLLM. +# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI355X using vLLM. # # Required env vars: -# MODEL, TP, CONC, RESULT_DIR +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR +# +# OFFLOADING values: +# none - vLLM GPU KV only.
+# cpu - vLLM native CPU offload. +# lmcache - LMCache MP server + vLLM LMCacheMPConnector. source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +EP_SIZE=${EP_SIZE:-1} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -24,6 +29,10 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi || true +amd-smi || true + # `hf download` creates the target dir if missing and is itself idempotent. # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE # Either way, MODEL_PATH is what the server is launched with. @@ -35,59 +44,240 @@ else hf download "$MODEL" export MODEL_PATH="$MODEL" fi -rocm-smi || true -amd-smi || true # ---- Resolve traces and install deps ---------------------------------------- # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 # corpus has requests up to ~1M proxy tokens that would be rejected. # Switch to the 256k-capped variant (470 traces, max in+out <= 256k). 
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k +#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +#060226 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k resolve_trace_source install_agentic_deps # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" +LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" mkdir -p "$RESULT_DIR" -OFFLOAD_ARGS="" +OFFLOAD_ARGS=() +PREFIX_CACHE_ARGS=() + +# ---- Lmcache config ---------------------------------------------------------- +LMCACHE_PID="" + +cleanup_lmcache_server() { + if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then + kill "$LMCACHE_PID" 2>/dev/null || true + wait "$LMCACHE_PID" 2>/dev/null || true + fi +} + +trap cleanup_lmcache_server EXIT + +wait_for_lmcache_ready() { + { set +x; } 2>/dev/null + local attempts="${LMCACHE_READY_ATTEMPTS:-120}" + local tail_pid="" + + while [ ! -f "$LMCACHE_LOG" ]; do + if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then + echo "LMCache server died before creating log file. Exiting." >&2 + exit 1 + fi + sleep 1 + done + + tail -f -n +1 "$LMCACHE_LOG" & + tail_pid=$! + + for ((i = 1; i <= attempts; i++)); do + if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + return 0 + fi + if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then + echo "LMCache server died before becoming healthy. Log follows:" >&2 + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + cat "$LMCACHE_LOG" >&2 || true + exit 1 + fi + sleep 1 + done + + echo "Timed out waiting for LMCache server healthcheck. 
Log follows:" >&2 + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + cat "$LMCACHE_LOG" >&2 || true + exit 1 +} + case "$OFFLOADING" in none) ;; cpu) - # SimpleCPUOffloadConnector now works on ROCm with the - # vllm/vllm-openai-rocm:nightly-51f22dcfd0... image (vllm-project/vllm@20cac26b). - # Use the same offload path as NVIDIA so cross-vendor cpu-offload - # numbers are apples-to-apples. - # MI355X nodes have substantial DRAM; override workflow default (600 GB) - # so we offload up to 2 TB of KV cache. + unset VLLM_USE_SIMPLE_KV_OFFLOAD + # MI355X nodes have ~2.7 TiB of host DRAM available for offload; + # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for + # worker RSS / page cache / slurm cgroup). + TOTAL_CPU_DRAM_GB=3000 + TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + # Use vLLM's regular native KV-offload path (OffloadingConnector), + # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to + # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1 + # would switch it to SimpleCPUOffloadConnector. We intentionally leave + # that env var UNSET here so the regular OffloadingConnector path is + # used. The shortcut --kv_offloading_backend native + --kv_offloading_size + # form constructs the KVTransferConfig at engine startup + # (vllm/config/vllm.py:662). + + # Remove --disable-hybrid-kv-cache-manager and enable hybrid kv cache manager (default) + # This gives extra cache hit than disabling hybrid kv cache manager + OFFLOAD_ARGS=( + --kv_offloading_backend native + --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB" + ) + ;; + lmcache) + { set +x; } 2>/dev/null + unset VLLM_USE_SIMPLE_KV_OFFLOAD + + git clone https://github.com/LMCache/LMCache.git + cd LMCache + pip install -r requirements/build.txt + CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation + cd .. 
+ + python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null + + # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV + # pool, but let the external MP server own that pool so vLLM does not + # split --kv-offloading-size across TP ranks through the integrated + # LMCache backend. + TOTAL_CPU_DRAM_GB=3000 + LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" + LMCACHE_PORT="${LMCACHE_PORT:-5555}" + LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" + # LMCacheMPConnector concatenates lmcache.mp.host and port into the + # ZMQ endpoint. Bind the server to a raw host, but pass the connector a + # ZMQ-style host string. + LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" + #LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + # (srok)TODO: intentionally increased DRAM size TOTAL_CPU_DRAM_GB=2000 - export VLLM_USE_SIMPLE_KV_OFFLOAD=1 - OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" + LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB))}" + LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" + # LMCache read locks are leases on chunks that lookup has promised + # vLLM can retrieve. The default 300s TTL is too short for this + # long-context agentic queue: TP8/conc32 can spend >300s between + # lookup and retrieve while GPU KV is saturated, which leaves the + # object present in L1 but no longer readable. Keep the 2.5 TB pool + # size unchanged and only extend the lookup-to-retrieve lease. + LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}" + # (srok) check 256 vs 32 + #LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" + LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-32}" + LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" + export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" + export LMCACHE_BLOCKING_TIMEOUT_SECS=120 + + set -x + echo "Starting LMCache MP server..." 
+ LMCACHE_CMD=( + lmcache server + --host "$LMCACHE_HOST" + --port "$LMCACHE_PORT" + --http-host "$LMCACHE_HOST" + --http-port "$LMCACHE_HTTP_PORT" + --l1-size-gb "$LMCACHE_L1_SIZE_GB" + --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB" + --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS" + --chunk-size "$LMCACHE_CHUNK_SIZE" + --max-workers "$LMCACHE_MAX_WORKERS" + --eviction-policy LRU + ) + printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt" + printf '\n' >> "$RESULT_DIR/lmcache_command.txt" + "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 & + LMCACHE_PID=$! + echo "LMCache server PID: $LMCACHE_PID" + wait_for_lmcache_ready + + PREFIX_CACHE_ARGS=(--enable-prefix-caching) + # Remove --disable-hybrid-kv-cache-manager and enable hybrid kv cache manager (default) + # This gives extra cache hit than disabling hybrid kv cache manager + OFFLOAD_ARGS=( + --kv-transfer-config + "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}" + ) ;; *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; esac -if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel"; else EP=" "; fi +# ---- LLM server config ---------------------------------------------------------- +EP_ARGS=() +if [ "$EP_SIZE" -gt 1 ]; then + EP_ARGS=(--enable-expert-parallel) +fi echo "Starting vllm server..." 
+export PYTHONNOUSERSITE=1 + +# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) +pip install -q amd-quark + +# Workaround for MEC FW <177 RCCL memory reclaim issue +version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') +if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + export VLLM_ROCM_USE_AITER=1 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 -export PYTHONNOUSERSITE=1 +export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0 +VLLM_BLOCK_SIZE=32 +ASYNC_SCHEDULING_ARGS="" + +if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then + export VLLM_ROCM_USE_AITER_MOE=0 + ASYNC_SCHEDULING_ARGS="--no-async-scheduling" + echo "TP8/EP8: using block size 32, shuffle disabled, AITER MoE disabled, async scheduling disabled." +elif (( CONC < 64 )); then + ASYNC_SCHEDULING_ARGS="--no-async-scheduling" + echo "c${CONC}: using block size 32, shuffle disabled, async scheduling disabled." +elif (( CONC == 64 )); then + ASYNC_SCHEDULING_ARGS="--no-async-scheduling" + export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 + VLLM_BLOCK_SIZE=16 + echo "c64: using block size 16, shuffle enabled, async scheduling disabled." +else + export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 + VLLM_BLOCK_SIZE=16 + echo "c${CONC}: using block size 16, shuffle enabled, async scheduling enabled." 
+fi -vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ ---host 0.0.0.0 \ ---port $PORT \ ---tensor-parallel-size=$TP \ -$EP \ ---gpu-memory-utilization 0.95 \ ---max-model-len $MAX_MODEL_LEN \ ---kv-cache-dtype fp8 \ ---block-size=32 \ ---max-num-seqs $CONC \ ---attention-backend "ROCM_AITER_UNIFIED_ATTN" \ ---trust-remote-code \ -$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & +{ set +x; } 2>/dev/null +VLLM_CMD=( + vllm serve "$MODEL" + --host 0.0.0.0 + --port "$PORT" + --tensor-parallel-size="$TP" + "${EP_ARGS[@]}" + --gpu-memory-utilization 0.95 + --kv-cache-dtype fp8 + --block-size=$VLLM_BLOCK_SIZE + --trust-remote-code + --attention-backend "ROCM_AITER_FA" + --max-num-seqs "$CONC" + $ASYNC_SCHEDULING_ARGS + "${PREFIX_CACHE_ARGS[@]}" + "${OFFLOAD_ARGS[@]}" +) +printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt" +printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" +"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 & SERVER_PID=$! echo "Server PID: $SERVER_PID" diff --git a/benchmarks/single_node/agentic/qwen3.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp4_mi355x.sh new file mode 100755 index 000000000..fe85b05ab --- /dev/null +++ b/benchmarks/single_node/agentic/qwen3.5_fp4_mi355x.sh @@ -0,0 +1,150 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for Qwen3.5 FP4 on MI355X using SGLang. +# +# Base server recipe follows the upstream MI300X reference +# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe): +# aiter attention backend, aiter allreduce fusion, mem-fraction 0.75. +# The agentic harness (resolve_trace_source / build_replay_cmd / +# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and +# --disable-radix-cache is dropped because agentic replay needs prefix reuse. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE +# +# OFFLOADING values: +# none - SGLang GPU KV with the default RadixAttention prefix cache.
+# hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix. + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +EP_SIZE=${EP_SIZE:-1} + +SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + +rocm-smi || true +amd-smi || true + +# ---- Resolve traces and install deps ---------------------------------------- +# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the +# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf +# signal at high concurrency. +#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +#060226 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Cache / offload config ------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +CACHE_ARGS=() +WARMUP_ARGS=() +CUDA_GRAPH_MAX_BS="$CONC" +case "$OFFLOADING" in + none) + # Leave SGLang's default RadixAttention prefix cache on — agentic + # replay needs it; --disable-radix-cache would zero the hit rate. 
+        ;;
+    hicache)
+        # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per
+        # TP rank (KV + Mamba), so the node-total DRAM budget (3000 GB unless
+        # HICACHE_TOTAL_CPU_DRAM_GB is set) divides by TP and the host-pool count.
+        TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-3000}"
+        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}"
+        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-300}}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
+        # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which
+        # requires page_size=1. Keep the safer direct/layer_first copy path;
+        # kernel/page_first faults on first prefill in this mode on ROCm.
+        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
+        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
+        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
+        if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
+            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
+        fi
+        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
+            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
+            exit 1
+        fi
+        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
+        CACHE_ARGS=(
+            --page-size "$HICACHE_PAGE_SIZE"
+            --enable-hierarchical-cache
+            --hicache-size "$HICACHE_SIZE_GB"
+            --hicache-io-backend "$HICACHE_IO_BACKEND"
+            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+        )
+        # HiCache startup reaches API readiness but SGLang's internal warmup
+        # request can time out on this path; let aiperf own benchmark traffic.
+        WARMUP_ARGS=(--skip-server-warmup)
+        # Cap ROCm graph capture at HICACHE_CUDA_GRAPH_MAX_BS (default 256); set
+        # it lower if graph capture fails at startup for high concurrency.
+        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-256}"
+        if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
+            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
+        fi
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
+echo "Starting SGLang server..."
+export PYTHONNOUSERSITE=1
+
+python3 -m sglang.launch_server \
+    --attention-backend aiter \
+    --model-path "$MODEL" \
+    --host=0.0.0.0 \
+    --port "$PORT" \
+    --tensor-parallel-size "$TP" \
+    --ep-size "$EP_SIZE" \
+    --trust-remote-code \
+    --model-loader-extra-config '{"enable_multithread_load": true}' \
+    --watchdog-timeout 1200 \
+    --tokenizer-worker-num 6 \
+    --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" \
+    --max-running-requests "$CONC" \
+    --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" \
+    --mem-fraction-static 0.8 \
+    "${CACHE_ARGS[@]}" \
+    "${WARMUP_ARGS[@]}" \
+    --enable-metrics > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
index ff901b674..8c6f82410 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
@@ -2,18 +2,31 @@
 set -euo pipefail
 set -x
 
 # Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang.
+#
+# Base server recipe follows the upstream MI300X reference
+# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe), except
+# that this MI355X script uses the triton attention backend and mem-fraction 0.8.
+# The agentic harness (resolve_trace_source / build_replay_cmd /
+# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and
+# --disable-radix-cache is dropped because agentic replay needs prefix reuse.
 #
 # Required env vars:
-#   MODEL, TP, CONC, RESULT_DIR
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION
+#
+# OFFLOADING values:
+#   none    - SGLang GPU KV with the default RadixAttention prefix cache.
+#   hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
 
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=131072
-fi
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
+
+SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -30,23 +43,87 @@ else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
 fi
+
 rocm-smi || true
 amd-smi || true
 
+# ---- Resolve traces and install deps ----------------------------------------
+# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
+# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf
+# signal at high concurrency.
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+#060226
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
+
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source
 install_agentic_deps
 
-# ---- Start SGLang server ----------------------------------------------------
+# ---- Cache / offload config -------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
 mkdir -p "$RESULT_DIR"
 
+CACHE_ARGS=()
+WARMUP_ARGS=()
+CUDA_GRAPH_MAX_BS="$CONC"
+case "$OFFLOADING" in
+    none)
+        # Leave SGLang's default RadixAttention prefix cache on — agentic
+        # replay needs it; --disable-radix-cache would zero the hit rate.
+        ;;
+    hicache)
+        # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per
+        # TP rank (KV + Mamba), so the node-total DRAM budget (3000 GB unless
+        # HICACHE_TOTAL_CPU_DRAM_GB is set) divides by TP and the host-pool count.
+        TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-3000}"
+        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}"
+        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-300}}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
+        # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which
+        # requires page_size=1. Keep the safer direct/layer_first copy path;
+        # kernel/page_first faults on first prefill in this mode on ROCm.
+        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
+        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
+        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
+        if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
+            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
+        fi
+        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
+            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
+            exit 1
+        fi
+        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
+        CACHE_ARGS=(
+            --page-size "$HICACHE_PAGE_SIZE"
+            --enable-hierarchical-cache
+            --hicache-size "$HICACHE_SIZE_GB"
+            --hicache-io-backend "$HICACHE_IO_BACKEND"
+            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+        )
+        # HiCache startup reaches API readiness but SGLang's internal warmup
+        # request can time out on this path; let aiperf own benchmark traffic.
+        WARMUP_ARGS=(--skip-server-warmup)
+        # Cap ROCm graph capture at HICACHE_CUDA_GRAPH_MAX_BS (default 256); set
+        # it lower if graph capture fails at startup for high concurrency.
+        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-256}"
+        if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
+            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
+        fi
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
 echo "Starting SGLang server..."
 export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
     --attention-backend triton \
-    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
+    --model-path $MODEL \
     --host=0.0.0.0 \
     --port $PORT \
     --tensor-parallel-size $TP \
@@ -56,10 +133,10 @@ python3 -m sglang.launch_server \
     --enable-aiter-allreduce-fusion \
-    --cuda-graph-max-bs $CONC \
+    --cuda-graph-max-bs $CUDA_GRAPH_MAX_BS \
     --max-running-requests $CONC \
-    --max-prefill-tokens 32768 \
-    --scheduler-recv-interval 30 \
+    --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
     --mem-fraction-static 0.8 \
-    --context-length $MAX_MODEL_LEN \
+    "${CACHE_ARGS[@]}" \
+    "${WARMUP_ARGS[@]}" \
     --enable-metrics > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
@@ -69,4 +146,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index d62e6bc4b..96b9ad4f0 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -210,7 +210,7 @@ else
     # mia1-p01-g09: pyxis broken (persistently fails to create container filesystem)
     # mia1-p01-g11: docker.sock permissions denied (cluster-cleanup step fails)
     # Both have been root-caused via #1431/#1432/#1440/#1441/#1443 sweep failures.
- salloc --partition=$PARTITION --exclude=mia1-p01-g09,mia1-p01-g11 --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=500 --no-shell --job-name="$RUNNER_NAME" + salloc --partition=$PARTITION --exclude=mia1-p01-g09,mia1-p01-g11,mia1-p01-g37 --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=500 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "docker stop \$(docker ps -a -q)" From ba1bb37c2c95215185a2a103516aa21f64bb5b65 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Thu, 11 Jun 2026 12:54:36 +0530 Subject: [PATCH 02/21] Add GLM5.1 & Qwen3.5 MI300 Agentic Scripts Signed-off-by: ajith-sirra-amd --- .../single_node/agentic/glm5.1_fp8_mi300x.sh | 117 ++++++++++++++ .../single_node/agentic/qwen3.5_fp8_mi300x.sh | 147 ++++++++++++++++++ 2 files changed, 264 insertions(+) create mode 100644 benchmarks/single_node/agentic/glm5.1_fp8_mi300x.sh create mode 100644 benchmarks/single_node/agentic/qwen3.5_fp8_mi300x.sh diff --git a/benchmarks/single_node/agentic/glm5.1_fp8_mi300x.sh b/benchmarks/single_node/agentic/glm5.1_fp8_mi300x.sh new file mode 100644 index 000000000..3918ef9de --- /dev/null +++ b/benchmarks/single_node/agentic/glm5.1_fp8_mi300x.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for GLM-5.1 FP8 on MI300X using SGLang. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION +# +# OFFLOADING values: +# none - SGLang GPU KV with the default RadixAttention prefix cache. +# hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix. 
+ +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi || true +amd-smi || true + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ROCm / SGLang performance tuning for MI300X (gfx942) +export SAFETENSORS_FAST_GPU=1 + +# ---- Cache / offload config ------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +CACHE_ARGS=() +CUDA_GRAPH_MAX_BS="$CONC" +case "$OFFLOADING" in + none) + # Leave SGLang's default RadixAttention prefix cache on — agentic + # replay needs it; --disable-radix-cache would zero the hit rate. + ;; + hicache) + # GLM-5.1 is a dense-KV (non-hybrid) model, so it allocates a single + # HiCache host pool per TP rank. --hicache-size is per rank per host + # pool while the workflow input is a node-total DRAM budget, so divide + # by TP and the host-pool count. Overridable for one-off tuning. 
+ HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}" + HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-180}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" + HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}" + if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then + HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" + fi + if [ "$HICACHE_SIZE_GB" -lt 1 ]; then + echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2 + exit 1 + fi + echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}" + CACHE_ARGS=( + --enable-hierarchical-cache + --hicache-size "$HICACHE_SIZE_GB" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + ) + # Capture ROCm graphs up to full concurrency so the hicache arm is a + # fair A/B against the none arm (which captures to $CONC). The MI355X + # recipe caps this at 16 due to a high-conc capture crash on that HW; + # on MI300X we follow $CONC. Override via env if MI300X hits the same + # startup crash at high conc. + HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-$CONC}" + if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then + CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS" + fi + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + +pip install -U transformers + +echo "Starting SGLang server..." 
+export PYTHONNOUSERSITE=1 + +python3 -m sglang.launch_server \ + --model-path $MODEL \ + --host=0.0.0.0 \ + --port $PORT \ + --tensor-parallel-size $TP \ + --trust-remote-code \ + --cuda-graph-max-bs $CUDA_GRAPH_MAX_BS \ + --max-running-requests $CONC \ + --mem-fraction-static 0.85 \ + --tool-call-parser glm47 \ + --reasoning-parser glm45 \ + --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ + --nsa-prefill-backend tilelang \ + --nsa-decode-backend tilelang \ + --kv-cache-dtype fp8_e4m3 \ + --tokenizer-worker-num $((TP*2)) \ + "${CACHE_ARGS[@]}" \ + --enable-metrics > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi300x.sh new file mode 100644 index 000000000..e1032772d --- /dev/null +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi300x.sh @@ -0,0 +1,147 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for Qwen3.5 FP8 on MI300X using SGLang. +# +# Base server recipe follows the upstream MI300X reference +# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe): +# aiter attention backend, aiter allreduce fusion, mem-fraction 0.75. +# The agentic harness (resolve_trace_source / build_replay_cmd / +# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and +# --disable-radix-cache is dropped because agentic replay needs prefix reuse. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE +# +# OFFLOADING values: +# none - SGLang GPU KV with the default RadixAttention prefix cache. 
+# hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix. + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE + +SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi || true +amd-smi || true + +# RCCL on these MI300X hosts fails ncclCommInitRank with an unhandled CUDA +# error when P2P is enabled; disable the P2P transport so TP init falls back +# to the shared-memory path. Overridable for hosts where P2P works. +export NCCL_P2P_DISABLE="${NCCL_P2P_DISABLE:-1}" + +# ---- Resolve traces and install deps ---------------------------------------- +# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the +# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf +# signal at high concurrency. +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k + +resolve_trace_source +install_agentic_deps + +# ---- Cache / offload config ------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +CACHE_ARGS=() +WARMUP_ARGS=() +CUDA_GRAPH_MAX_BS="$CONC" +case "$OFFLOADING" in + none) + # Leave SGLang's default RadixAttention prefix cache on — agentic + # replay needs it; --disable-radix-cache would zero the hit rate. + ;; + hicache) + # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per + # TP rank (one hierarchical KV, one hierarchical Mamba), so the + # node-total DRAM budget divides by TP and the host-pool count. + # MI300X nodes here expose ~2.3 TB usable CPU DRAM. The hybrid + # GDN/Mamba path allocates TWO host pools per TP rank (KV + Mamba), so + # the node total is HICACHE_SIZE_GB * TP * HICACHE_HOST_POOL_COUNT. 
The + # harness passes a generic TOTAL_CPU_DRAM_GB=2500, which yields + # 2500/8/2=156 GB/pool -> 156*8*2=2496 GB > available -> OOM-kill (137). + # Default to a node-safe 1900 (1888 GB allocated), overridable via env. + TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-1900}" + HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}" + HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-180}}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" + # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which + # requires page_size=1. Keep the safer direct/layer_first copy path; + # kernel/page_first faults on first prefill in this mode on ROCm. + HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" + HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}" + if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then + HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" + fi + if [ "$HICACHE_SIZE_GB" -lt 1 ]; then + echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2 + exit 1 + fi + echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}" + CACHE_ARGS=( + --page-size "$HICACHE_PAGE_SIZE" + --enable-hierarchical-cache + --hicache-size "$HICACHE_SIZE_GB" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + ) + # HiCache startup reaches API readiness but SGLang's internal warmup + # request can time out on this path; let aiperf own benchmark traffic. 
+ WARMUP_ARGS=(--skip-server-warmup) + # Capture ROCm graphs up to full concurrency so the hicache arm is a + # fair A/B against the none arm (which captures to $CONC). The MI355X + # recipe caps this at 16 due to a high-conc capture crash on that HW; + # on MI300X we lift it to match $CONC. Override via env if MI300X hits + # the same startup crash at high conc. + HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-$CONC}" + if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then + CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS" + fi + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + +echo "Starting SGLang server..." +export PYTHONNOUSERSITE=1 + +# following AMD Andy's MI300X recipe +# https://www.linkedin.com/feed/update/urn:li:activity:7429203734389280768/ +python3 -m sglang.launch_server \ + --attention-backend aiter \ + --model-path $MODEL \ + --host=0.0.0.0 \ + --port $PORT \ + --tensor-parallel-size $TP \ + --ep-size $EP_SIZE \ + --trust-remote-code \ + --tokenizer-worker-num 6 \ + --enable-aiter-allreduce-fusion \ + --cuda-graph-max-bs $CUDA_GRAPH_MAX_BS \ + --max-running-requests $CONC \ + --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ + --mem-fraction-static 0.75 \ + "${CACHE_ARGS[@]}" \ + "${WARMUP_ARGS[@]}" \ + --enable-metrics > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file From eba42330b46d3388c0bb7cf78e537410930237dd Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 12 Jun 2026 15:07:17 +0900 Subject: [PATCH 03/21] [AMD] add DSV4-FP4-MI355x atom agentic benchmark and master yaml config Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 16 ++ .../agentic/dsv4_fp4_mi355x_atom.sh | 260 ++++++++++++++++++ 2 files changed, 276 insertions(+) create mode 100755 benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index ee8718506..6b6f38fb5 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2174,6 +2174,22 @@ dsv4-fp4-mi355x-atom-mtp: search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 1024, spec-decoding: mtp } +# target +dsv4-fp4-mi355x-atom-agentic-lmcache: + image: rocm/atom-dev:nightly_202606101557 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: atom + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [44, 48, 52, 56, 60] } + #- { tp: 8, ep: 1, offloading: lmcache, conc-list: [44, 48, 52, 56, 60] } + qwen3.5-bf16-mi325x-sglang-mtp: image: lmsysorg/sglang:v0.5.12-rocm720-mi30x model: Qwen/Qwen3.5-397B-A17B diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh new file mode 100755 index 000000000..701f39b41 --- /dev/null +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh @@ -0,0 +1,260 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay 
benchmark for DeepSeek-V4 FP4 on MI355X using ATOM. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION +# +# OFFLOADING values: +# none - ATOM GPU KV only. +# cpu - ATOM native CPU offload. +# lmcache - LMCache MP server + ATOM LMCacheMPConnector. + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +EP_SIZE=${EP_SIZE:-1} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ROCR/HIP visibility for ATOM +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE. +# Either way, MODEL_PATH is what the server is launched with. +if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
+rocm-smi || true
+amd-smi || true
+
+# ---- Resolve traces and install deps ----------------------------------------
+# https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-060826
+export WEKA_LOADER_OVERRIDE=semianalysisai/cc-traces-weka-with-subagents-060826
+
+# ---- Resolve traces and install deps ----------------------------------------
+resolve_trace_source
+install_agentic_deps
+
+# ---- Server config ----------------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
+mkdir -p "$RESULT_DIR"
+
+OFFLOAD_ARGS=()
+PREFIX_CACHE_ARGS=()
+
+# ---- Lmcache config ----------------------------------------------------------
+LMCACHE_PID=""
+
+cleanup_lmcache_server() {  # EXIT trap: stop the LMCache server if it is still running
+    if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then
+        kill "$LMCACHE_PID" 2>/dev/null || true
+        wait "$LMCACHE_PID" 2>/dev/null || true
+    fi
+}
+
+trap cleanup_lmcache_server EXIT
+
+wait_for_lmcache_ready() {  # poll the LMCache healthcheck; return 0 when healthy, exit 1 on death/timeout
+    { set +x; } 2>/dev/null  # keep xtrace quiet while polling
+    local attempts="${LMCACHE_READY_ATTEMPTS:-120}"  # one attempt per second
+    local tail_pid="" i  # i: loop counter, kept out of the global scope
+
+    while [ ! -f "$LMCACHE_LOG" ]; do
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before creating log file. Exiting." >&2
+            exit 1
+        fi
+        sleep 1
+    done
+
+    tail -f -n +1 "$LMCACHE_LOG" &  # stream the server log while waiting
+    tail_pid=$!
+
+    for ((i = 1; i <= attempts; i++)); do
+        if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            return 0
+        fi
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before becoming healthy. Log follows:" >&2
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            cat "$LMCACHE_LOG" >&2 || true
+            exit 1
+        fi
+        sleep 1
+    done
+
+    echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2
+    kill "$tail_pid" 2>/dev/null || true
+    wait "$tail_pid" 2>/dev/null || true
+    cat "$LMCACHE_LOG" >&2 || true
+    exit 1
+}
+
+case "$OFFLOADING" in
+    none) ;;
+    cpu)
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+        # MI355X nodes have ~2.7 TiB of host DRAM available for offload;
+        # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for
+        # worker RSS / page cache / slurm cgroup).
+        #TODO: fix
+        TOTAL_CPU_DRAM_GB=3000
+        TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
+        # Use vLLM's regular native KV-offload path (OffloadingConnector),
+        # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to
+        # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1
+        # would switch it to SimpleCPUOffloadConnector. We intentionally leave
+        # that env var UNSET here so the regular OffloadingConnector path is
+        # used. The shortcut --kv_offloading_backend native + --kv_offloading_size
+        # form constructs the KVTransferConfig at engine startup
+        # (vllm/config/vllm.py:662).
+        OFFLOAD_ARGS=(
+            --kv_offloading_backend native
+            --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB"
+            --disable-hybrid-kv-cache-manager
+        )
+        ;;
+    lmcache)
+        { set +x; } 2>/dev/null
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+
+        git clone https://github.com/LMCache/LMCache.git
+        cd LMCache
+        pip install -r requirements/build.txt
+        CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation
+        cd ..
+
+        python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
+
+        # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
+        # pool, but let the external MP server own that pool so vLLM does not
+        # split --kv-offloading-size across TP ranks through the integrated
+        # LMCache backend.
+ #TODO: fix + TOTAL_CPU_DRAM_GB=3000 + LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" + LMCACHE_PORT="${LMCACHE_PORT:-5555}" + LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" + # LMCacheMPConnector concatenates lmcache.mp.host and port into the + # ZMQ endpoint. Bind the server to a raw host, but pass the connector a + # ZMQ-style host string. + LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" + LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" + # LMCache read locks are leases on chunks that lookup has promised + # vLLM can retrieve. The default 300s TTL is too short for this + # long-context agentic queue: TP8/conc32 can spend >300s between + # lookup and retrieve while GPU KV is saturated, which leaves the + # object present in L1 but no longer readable. Keep the 2.5 TB pool + # size unchanged and only extend the lookup-to-retrieve lease. + LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}" + LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" + LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" + export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" + export LMCACHE_BLOCKING_TIMEOUT_SECS=120 + + echo "Starting LMCache MP server..." + LMCACHE_CMD=( + lmcache server + --host "$LMCACHE_HOST" + --port "$LMCACHE_PORT" + --http-host "$LMCACHE_HOST" + --http-port "$LMCACHE_HTTP_PORT" + --l1-size-gb "$LMCACHE_L1_SIZE_GB" + --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB" + --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS" + --chunk-size "$LMCACHE_CHUNK_SIZE" + --max-workers "$LMCACHE_MAX_WORKERS" + --eviction-policy LRU + ) + printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt" + printf '\n' >> "$RESULT_DIR/lmcache_command.txt" + "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 & + LMCACHE_PID=$! 
+ echo "LMCache server PID: $LMCACHE_PID" + wait_for_lmcache_ready + # (srok) TODO: + PREFIX_CACHE_ARGS=(--enable_prefix_caching) + OFFLOAD_ARGS=( + --kv-transfer-config + "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}" + --disable-hybrid-kv-cache-manager + ) + ;; + *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; +esac + +# ---- LLM server config ---------------------------------------------------------- +EP_ARGS=() +if [ "$EP_SIZE" -gt 1 ]; then + EP_ARGS=(--enable-expert-parallel) +fi + +echo "Starting ATOM server..." +export PYTHONNOUSERSITE=1 + +# Workaround for MEC FW <177 RCCL memory reclaim issue +version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') +if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +PARALLEL_ARGS=(-tp "$TP") #TP +if [ "$DP_ATTENTION" = "true" ]; then + if [ "$EP_SIZE" -gt 1 ]; then #DP+EP + PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention ) + else #DP+TP + PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention ) + fi +fi + +set -x +export ATOM_DISABLE_MMAP=true +export AITER_BF16_FP8_MOE_BOUND=0 +export ATOM_MOE_GU_ITLV=1 +{ set +x; } 2>/dev/null + +ATOM_CMD=( + python3 -m atom.entrypoints.openai_server \ + --model $MODEL \ + --server-port $PORT \ + "${PARALLEL_ARGS[@]}" \ + --kv_cache_dtype fp8 \ + --trust-remote-code \ + --gpu-memory-utilization 0.85 \ + "${PREFIX_CACHE_ARGS[@]}" + "${OFFLOAD_ARGS[@]}" +) +printf '%q ' "${ATOM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt" +printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" +"${ATOM_CMD[@]}" > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file From 32f50079d9911470bf28e6810342cb4d29dbf9a1 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 12 Jun 2026 15:13:36 +0900 Subject: [PATCH 04/21] [AMD] update DSV4-FP4-MI355x atom agentic benchmark and master yaml config Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 3 ++- benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 6b6f38fb5..7ce6882c3 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2187,7 +2187,8 @@ dsv4-fp4-mi355x-atom-agentic-lmcache: agentic-coding: - duration: 1800 search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [44, 48, 52, 56, 60] } + - { tp: 8, ep: 1, offloading: none, conc-list: [52] } + #- { tp: 8, ep: 1, offloading: none, conc-list: [44, 48, 52, 56, 60] } #- { tp: 8, ep: 1, offloading: lmcache, conc-list: [44, 48, 52, 56, 60] } qwen3.5-bf16-mi325x-sglang-mtp: diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh index 701f39b41..1ec554669 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh @@ -46,7 +46,7 @@ amd-smi || true # ---- Resolve traces and install deps ---------------------------------------- # https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-060826 -export WEKA_LOADER_OVERRIDE=semianalysisai/cc-traces-weka-with-subagents-060826 +export WEKA_LOADER_OVERRIDE=semianalysisai_cc-traces-weka-with-subagents-060826 # ---- Resolve 
traces and install deps ---------------------------------------- resolve_trace_source From 351e729285088efd7db8a135ad13429e37e6b49d Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 12 Jun 2026 17:04:32 +0900 Subject: [PATCH 05/21] [AMD] dsv4_fp4_mi355x_atom.sh: update agentic benchmark script Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh index 1ec554669..f1e680cbe 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_atom.sh @@ -46,7 +46,7 @@ amd-smi || true # ---- Resolve traces and install deps ---------------------------------------- # https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-060826 -export WEKA_LOADER_OVERRIDE=semianalysisai_cc-traces-weka-with-subagents-060826 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826 # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source From 64ce90cde70f6d504ef067415ed84481f5a803d9 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Fri, 12 Jun 2026 14:12:23 +0530 Subject: [PATCH 06/21] Add DSV4 MI355X Agentic Scripts Signed-off-by: ajith-sirra-amd --- .github/configs/amd-master.yaml | 15 ++ .../single_node/agentic/dsv4_fp4_mi355x.sh | 208 ++++++++++++++++++ .../agentic/dsv4_fp4_mi355x_sglang.sh | 44 +++- 3 files changed, 263 insertions(+), 4 deletions(-) create mode 100644 benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 7ce6882c3..7460aca80 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2736,3 +2736,18 @@ dsv4-fp4-mi355x-sglang-agentic: search-space: - { tp: 8, offloading: none, conc-list: [16, 32, 64] } - { 
tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] } + +dsv4-fp4-mi355x-sglang-agentic-hicache: + image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [52] } + - { tp: 8, offloading: hicache, conc-list: [52] } \ No newline at end of file diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh new file mode 100644 index 000000000..3d05e0bf4 --- /dev/null +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh @@ -0,0 +1,208 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for DeepSeek-V4-Pro FP4 on MI355X using SGLang. +# Adapted from benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh (fixed-seq-len +# sibling) with the agentic harness (build_replay_cmd / write_agentic_result_json +# / analyze_benchmark_distributions) swapped in for run_benchmark_serving. +# +# OFFLOADING selects the KV-cache tier: `none` keeps only SGLang's default +# RadixAttention prefix cache; `hicache` adds the SGLang HiCache CPU tier. +# vLLM-style connectors (e.g. SimpleCPUOffloadConnector) are not wired up here. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE, DP_ATTENTION + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION + +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=1000000 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ROCR/HIP visibility under slurm cgroups. 
+if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi +rocm-smi || true +amd-smi || true + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# Select the KV-cache offload tier. Only `none` and `hicache` are accepted; any +# other value (e.g. vLLM's SimpleCPUOffloadConnector modes) is rejected below. +CACHE_ARGS=() +WARMUP_ARGS=() +CUDA_GRAPH_MAX_BS="$CONC" +[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64 +case "$OFFLOADING" in + none) + # Leave SGLang's default RadixAttention prefix cache on — agentic + # replay needs it; --disable-radix-cache would zero the hit rate. + ;; + hicache) + # DeepSeek V4 HiCache uses ratio-based capacity control, not GB-based. + # DSv4 allocates several physical host sub-pools for each logical host + # token. MI355X nodes have ~3 TB of host DRAM (similar to B200's 3.8 + # TiB), so ratio=8 at TP≥8 provides a large useful CPU tier within the + # node budget. Lower TP configs use higher ratios to maintain adequate + # host token capacity without exceeding DRAM limits. 
+ if [ "$TP" -ge 8 ]; then + DEFAULT_HICACHE_RATIO=8 + else + DEFAULT_HICACHE_RATIO=16 + fi + HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first_direct}" + export SGLANG_ENABLE_UNIFIED_RADIX_TREE=1 + CACHE_ARGS=( + --enable-hierarchical-cache + --hicache-ratio "$HICACHE_RATIO" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + ) + echo "HiCache DSv4 CPU tier: ratio=$HICACHE_RATIO, write_policy=$HICACHE_WRITE_POLICY, io_backend=$HICACHE_IO_BACKEND, mem_layout=$HICACHE_MEM_LAYOUT" + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + +# Transformers in the container doesn't recognize the `deepseek_v4` model_type. +# PR #23608's fallback in hf_transformers_utils.get_config tries to handle this +# by writing a patched config to /tmp, but in practice isn't catching the error +# in this image. Patch the cached config.json directly instead: set model_type +# to `deepseek_v3` so AutoConfig.from_pretrained succeeds, and keep +# architectures=['DeepseekV4ForCausalLM'] so SGLang dispatches to its native +# DSv4 model class (python/sglang/srt/models/deepseek_v4.py). +python3 << PYEOF +import json +from huggingface_hub import hf_hub_download +path = hf_hub_download(repo_id="$MODEL", filename="config.json") +with open(path) as f: + config = json.load(f) +if config.get("model_type") == "deepseek_v4": + config["model_type"] = "deepseek_v3" + with open(path, "w") as f: + json.dump(config, f, indent=2) + print(f"Patched {path}: model_type deepseek_v4 -> deepseek_v3") +else: + print(f"No patch needed: model_type is {config.get('model_type')!r}") +PYEOF + +# DSv4 FP4-experts path. 
Mirrors the env block in the fixed-seq-len sibling +# (benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh), which tracks the active +# block in python/run_dsv4.sh on the amd/deepseek_v4 branch: +# SGLANG_DSV4_FP4_EXPERTS=True -> route experts through FP4 kernels +# SGLANG_FORCE_TRITON_MOE_FP8=0 -> dispatch MoE through aiter and apply +# the swiglu_limit clamp in the triton +# MoE fallback path. +export SGLANG_REASONING_EFFORT=max +export SGLANG_OPT_USE_FUSED_COMPRESS=true +export SGLANG_OPT_USE_OLD_COMPRESSOR=true +export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false +export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false +export SGLANG_OPT_USE_FUSED_HASH_TOPK=false +export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false +export SGLANG_OPT_USE_TILELANG_MHC_PRE=false +export SGLANG_OPT_USE_TILELANG_MHC_POST=false +export SGLANG_OPT_USE_AITER_MHC_PRE=true +export SGLANG_OPT_USE_AITER_MHC_POST=true +export SGLANG_ENABLE_THINKING=1 +export SGLANG_USE_AITER=1 +export SGLANG_USE_ROCM700A=1 +export SGLANG_TOPK_TRANSFORM_512_TORCH=0 +export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1 +export SGLANG_DSV4_FP4_EXPERTS=True +export SGLANG_OPT_DPSK_V4_RADIX=0 +export SGLANG_OPT_USE_OVERLAP_STORE_CACHE=false +export SGLANG_OPT_USE_FUSED_STORE_CACHE=false +export SGLANG_FORCE_TRITON_MOE_FP8=0 +export SGLANG_HACK_FLASHMLA_BACKEND=tilelang +export SGLANG_OPT_USE_TILELANG_INDEXER=true +export SGLANG_OPT_USE_TRITON_SWA_PREPARE=true + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# Parallelism: pure TP, TP+EP, or DEP (DP-attn + EP). Matches the dsv4 b200 +# vllm agentic launcher so the agentic sweep can probe both interactivity and +# throughput regimes. 
+PARALLEL_ARGS=(--tensor-parallel-size "$TP") +if [ "$DP_ATTENTION" = "true" ]; then + PARALLEL_ARGS+=( + --dp "$TP" + --enable-dp-attention + --enable-prefill-delayer + ) +fi +if [ "${EP_SIZE:-1}" -gt 1 ]; then + PARALLEL_ARGS+=(--ep-size "$EP_SIZE") +fi + +# --max-running-requests is per-engine. With DP-attn each DP engine handles +# only CONC/$TP sequences in steady state (the agentic harness load-balances +# users across DP ranks), so size the per-engine cap to that. +# Pure TP is a single engine and sees all CONC sequences itself. +if [ "$DP_ATTENTION" = "true" ]; then + PER_ENGINE_MAX_RUNNING=$(( CONC / TP )) + [ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1 +else + PER_ENGINE_MAX_RUNNING=$CONC +fi + +echo "Starting sglang server..." +python3 -m sglang.launch_server \ + --model-path "$MODEL_PATH" --served-model-name "$MODEL" \ + --host=0.0.0.0 \ + --port "$PORT" \ + "${PARALLEL_ARGS[@]}" \ + --trust-remote-code \ + --attention-backend compressed \ + --max-running-requests "$PER_ENGINE_MAX_RUNNING" \ + --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" \ + --page-size 256 \ + --context-length "$MAX_MODEL_LEN" \ + --chunked-prefill-size 8192 \ + --disable-shared-experts-fusion \ + --tool-call-parser deepseekv4 \ + --reasoning-parser deepseek-v4 \ + --chat-template "$(dirname "$0")/../chat_templates/deepseek_v4_thinking.jinja" \ + --watchdog-timeout 1800 \ + --enable-metrics \ + "${CACHE_ARGS[@]}" \ + "${WARMUP_ARGS[@]}" > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh index 029c8ea7f..3d05e0bf4 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh @@ -51,10 +51,43 @@ install_agentic_deps # Reject anything other than none: this launcher has no SGLang CPU-offload # wiring (different surface than vLLM's SimpleCPUOffloadConnector). +CACHE_ARGS=() +WARMUP_ARGS=() +CUDA_GRAPH_MAX_BS="$CONC" +[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64 case "$OFFLOADING" in - none) ;; + none) + # Leave SGLang's default RadixAttention prefix cache on — agentic + # replay needs it; --disable-radix-cache would zero the hit rate. + ;; + hicache) + # DeepSeek V4 HiCache uses ratio-based capacity control, not GB-based. + # DSv4 allocates several physical host sub-pools for each logical host + # token. MI355X nodes have ~3 TB of host DRAM (similar to B200's 3.8 + # TiB), so ratio=8 at TP≥8 provides a large useful CPU tier within the + # node budget. Lower TP configs use higher ratios to maintain adequate + # host token capacity without exceeding DRAM limits. 
+ if [ "$TP" -ge 8 ]; then + DEFAULT_HICACHE_RATIO=8 + else + DEFAULT_HICACHE_RATIO=16 + fi + HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first_direct}" + export SGLANG_ENABLE_UNIFIED_RADIX_TREE=1 + CACHE_ARGS=( + --enable-hierarchical-cache + --hicache-ratio "$HICACHE_RATIO" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + ) + echo "HiCache DSv4 CPU tier: ratio=$HICACHE_RATIO, write_policy=$HICACHE_WRITE_POLICY, io_backend=$HICACHE_IO_BACKEND, mem_layout=$HICACHE_MEM_LAYOUT" + ;; *) - echo "Error: dsv4_fp4_mi355x_sglang.sh only supports OFFLOADING=none (got '$OFFLOADING')" >&2 + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 exit 1 ;; esac @@ -152,7 +185,7 @@ python3 -m sglang.launch_server \ --trust-remote-code \ --attention-backend compressed \ --max-running-requests "$PER_ENGINE_MAX_RUNNING" \ - --cuda-graph-max-bs "$PER_ENGINE_MAX_RUNNING" \ + --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" \ --page-size 256 \ --context-length "$MAX_MODEL_LEN" \ --chunked-prefill-size 8192 \ @@ -160,7 +193,10 @@ python3 -m sglang.launch_server \ --tool-call-parser deepseekv4 \ --reasoning-parser deepseek-v4 \ --chat-template "$(dirname "$0")/../chat_templates/deepseek_v4_thinking.jinja" \ - --watchdog-timeout 1800 > "$SERVER_LOG" 2>&1 & + --watchdog-timeout 1800 \ + --enable-metrics \ + "${CACHE_ARGS[@]}" \ + "${WARMUP_ARGS[@]}" > "$SERVER_LOG" 2>&1 & SERVER_PID=$! 
echo "Server PID: $SERVER_PID" From 8ca4bc1b3eb6fc1fdd2fdbe702c56dcd16de37cc Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Fri, 12 Jun 2026 14:17:52 +0530 Subject: [PATCH 07/21] Add DSV4 MI355X Agentic Scripts Signed-off-by: ajith-sirra-amd --- benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh | 4 ++++ benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh index 3d05e0bf4..236895cd2 100644 --- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x.sh @@ -45,6 +45,10 @@ fi rocm-smi || true amd-smi || true +# ---- Resolve traces and install deps ---------------------------------------- +# https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-060826 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826 + # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source install_agentic_deps diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh index 3d05e0bf4..236895cd2 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh @@ -45,6 +45,10 @@ fi rocm-smi || true amd-smi || true +# ---- Resolve traces and install deps ---------------------------------------- +# https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-060826 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826 + # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source install_agentic_deps From 37f57a70e39ad8b1a5189a6698d8a9be9b609119 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Mon, 15 Jun 2026 01:59:58 +0900 Subject: [PATCH 08/21] [AMD] update DSV4-FP4-MI355X SGLang 
agentic benchmark and master yaml config Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 8 +- .../fixed_seq_len/dsv4_fp4_mi355x_sglang.sh | 254 ++++++++++-------- 2 files changed, 149 insertions(+), 113 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 7460aca80..9b9cceb12 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2737,8 +2737,9 @@ dsv4-fp4-mi355x-sglang-agentic: - { tp: 8, offloading: none, conc-list: [16, 32, 64] } - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] } +# target dsv4-fp4-mi355x-sglang-agentic-hicache: - image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4 + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x @@ -2749,5 +2750,6 @@ dsv4-fp4-mi355x-sglang-agentic-hicache: agentic-coding: - duration: 1800 search-space: - - { tp: 8, offloading: none, conc-list: [52] } - - { tp: 8, offloading: hicache, conc-list: [52] } \ No newline at end of file + #DPA, conc>=64 + - { tp: 8, dp-attn: false, offloading: none, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256] } + - { tp: 8, dp-attn: false, offloading: hicache, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256] } \ No newline at end of file diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh index b02a09489..c6f6cba25 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh @@ -1,101 +1,138 @@ #!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for DeepSeek-V4-Pro FP4 on MI355X using SGLang. 
+# Adapted from benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh (fixed-seq-len +# sibling) with the agentic harness (build_replay_cmd / write_agentic_result_json +# / analyze_benchmark_distributions) swapped in for run_benchmark_serving. +# +# This launcher does NOT support CPU offload. SGLang's KV offload paths are +# different from vLLM's SimpleCPUOffloadConnector, and the matching agentic +# config (dsv4-fp4-mi355x-sglang-agentic) only sweeps offloading=none. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars \ - MODEL \ - TP \ - DP_ATTENTION \ - EP_SIZE \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME \ - MAX_MODEL_LEN - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION + +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=1000000 fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi - -# sglang ships in the image at the SHA encoded in the image tag (built -# from the amd/deepseek_v4 branch in sgl-project/sglang). To bump sglang, -# bump the image tag in .github/configs/amd-master.yaml. - -# Transformers in the container doesn't recognize the `deepseek_v4` model_type. -# PR #23608's fallback in hf_transformers_utils.get_config tries to handle this -# by writing a patched config to /tmp, but in practice isn't catching the error -# in this image. Patch the cached config.json directly instead: set model_type -# to `deepseek_v3` so AutoConfig.from_pretrained succeeds, and keep -# architectures=['DeepseekV4ForCausalLM'] so SGLang dispatches to its native -# DSv4 model class (python/sglang/srt/models/deepseek_v4.py). 
-python3 << PYEOF -import json -from huggingface_hub import hf_hub_download -path = hf_hub_download(repo_id="$MODEL", filename="config.json") -with open(path) as f: - config = json.load(f) -if config.get("model_type") == "deepseek_v4": - config["model_type"] = "deepseek_v3" - with open(path, "w") as f: - json.dump(config, f, indent=2) - print(f"Patched {path}: model_type deepseek_v4 -> deepseek_v3") -else: - print(f"No patch needed: model_type is {config.get('model_type')!r}") -PYEOF - -# DSv4 FP4-experts path. Tracks the env block in python/run_dsv4.sh on the -# amd/deepseek_v4 branch (HEAD's active block is FP8; we override the two -# FP4-specific flags below): -# SGLANG_DSV4_FP4_EXPERTS=True -> route experts through the FP4 kernels -# SGLANG_FORCE_TRITON_MOE_FP8=0 -> dispatch MoE through aiter and apply -# the swiglu_limit clamp in the triton -# MoE fallback path. -export SGLANG_REASONING_EFFORT=max -export SGLANG_OPT_USE_FUSED_COMPRESS=true -export SGLANG_OPT_USE_OLD_COMPRESSOR=false -export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false -export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false -export SGLANG_OPT_USE_FUSED_HASH_TOPK=true +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ROCR/HIP visibility under slurm cgroups. +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi +rocm-smi || true +amd-smi || true + +# ---- Resolve traces and install deps ---------------------------------------- +# https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-060826 +# export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226 + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Hicache config ---------------------------------------------------------- +# Reject anything other than none: this launcher has no SGLang CPU-offload +# wiring (different surface than vLLM's SimpleCPUOffloadConnector). + +case "$OFFLOADING" in + none) + # Leave SGLang's default RadixAttention prefix cache on — agentic + # replay needs it; --disable-radix-cache would zero the hit rate. + ;; + hicache) + # DeepSeek V4 HiCache uses ratio-based capacity control, not GB-based. + # DSv4 allocates several physical host sub-pools for each logical host + # token. MI355X nodes have ~3 TB of host DRAM (similar to B200's 3.8 + # TiB), so ratio=8 at TP≥8 provides a large useful CPU tier within the + # node budget. Lower TP configs use higher ratios to maintain adequate + # host token capacity without exceeding DRAM limits. 
+ if [ "$TP" -ge 8 ]; then + DEFAULT_HICACHE_RATIO=8 + else + DEFAULT_HICACHE_RATIO=16 + fi + HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first_direct}" + export SGLANG_ENABLE_UNIFIED_RADIX_TREE=1 + CACHE_ARGS=( + --enable-hierarchical-cache + --hicache-ratio "$HICACHE_RATIO" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + ) + echo "HiCache DSv4 CPU tier: ratio=$HICACHE_RATIO, write_policy=$HICACHE_WRITE_POLICY, io_backend=$HICACHE_IO_BACKEND, mem_layout=$HICACHE_MEM_LAYOUT" + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + +# ---- LLM server config ---------------------------------------------------------- + +CACHE_ARGS=() +WARMUP_ARGS=() +CUDA_GRAPH_MAX_BS="$CONC" +[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64 + +export SGLANG_DEFAULT_THINKING=1 +export SGLANG_DSV4_REASONING_EFFORT=max export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false +export SGLANG_USE_AITER=1 +export SGLANG_USE_ROCM700A=0 +export SGLANG_OPT_USE_FUSED_COMPRESS=true +export SGLANG_HACK_FLASHMLA_BACKEND=unified_kv_triton +export SGLANG_OPT_FP8_WO_A_GEMM=false +export SGLANG_OPT_USE_JIT_INDEXER_METADATA=false +export SGLANG_OPT_USE_TOPK_V2=false +export SGLANG_OPT_USE_AITER_INDEXER=true +export SGLANG_OPT_USE_TILELANG_INDEXER=false export SGLANG_OPT_USE_TILELANG_MHC_PRE=false export SGLANG_OPT_USE_TILELANG_MHC_POST=false -export SGLANG_OPT_USE_AITER_MHC_PRE=true -export SGLANG_OPT_USE_AITER_MHC_POST=true -export SGLANG_ENABLE_THINKING=1 -export SGLANG_USE_AITER=1 -export SGLANG_USE_ROCM700A=1 -export SGLANG_TOPK_TRANSFORM_512_TORCH=0 export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1 -export SGLANG_DSV4_FP4_EXPERTS=True -export SGLANG_OPT_DPSK_V4_RADIX=1 
-export SGLANG_OPT_USE_OVERLAP_STORE_CACHE=false -export SGLANG_OPT_USE_FUSED_STORE_CACHE=true -export SGLANG_FORCE_TRITON_MOE_FP8=0 -export SGLANG_HACK_FLASHMLA_BACKEND=triton -export SGLANG_OPT_USE_TILELANG_INDEXER=true -export SGLANG_OPT_USE_TRITON_SWA_PREPARE=true +export SGLANG_OPT_USE_FUSED_COMPRESS_TRITON=true export AITER_BF16_FP8_MOE_BOUND=0 -export SGLANG_OPT_FUSE_WQA_WKV=true -export SGLANG_OPT_USE_FUSED_PAGED_COMPRESS=true -export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=0 +export SGLANG_EAGER_INPUT_NO_COPY=true -SERVER_LOG=/workspace/server.log - -EVAL_CONTEXT_ARGS="" -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" -fi -# Start GPU monitoring (power, temperature, clocks every second) -start_gpu_monitor +# multi-stream +export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=false +export SGLANG_ROCM_USE_MULTI_STREAM=false -PARALLEL_ARGS=( - --tensor-parallel-size "$TP" -) -if [ "${DP_ATTENTION}" = "true" ]; then +# Parallelism: pure TP, TP+EP, or DEP (DP-attn + EP). Matches the dsv4 b200 +# vllm agentic launcher so the agentic sweep can probe both interactivity and +# throughput regimes. +PARALLEL_ARGS=(--tensor-parallel-size "$TP") +if [ "$DP_ATTENTION" = "true" ]; then PARALLEL_ARGS+=( --dp "$TP" --enable-dp-attention @@ -106,14 +143,26 @@ if [ "${EP_SIZE:-1}" -gt 1 ]; then PARALLEL_ARGS+=(--ep-size "$EP_SIZE") fi -python3 -m sglang.launch_server \ +# --max-running-requests is per-engine. With DP-attn each DP engine handles +# only CONC/$TP sequences in steady state (the agentic harness load-balances +# users across DP ranks), so size the per-engine cap to that. +# Pure TP is a single engine and sees all CONC sequences itself. +if [ "$DP_ATTENTION" = "true" ]; then + PER_ENGINE_MAX_RUNNING=$(( CONC / TP )) + [ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1 +else + PER_ENGINE_MAX_RUNNING=$CONC +fi + +echo "Starting sglang server..." 
+sglang serve \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ "${PARALLEL_ARGS[@]}" \ --trust-remote-code \ --disable-radix-cache \ - --attention-backend compressed \ + --attention-backend dsv4 \ --max-running-requests ${CONC} \ --mem-fraction-static 0.90 \ --swa-full-tokens-ratio 0.15 \ @@ -124,31 +173,16 @@ python3 -m sglang.launch_server \ --tool-call-parser deepseekv4 \ --reasoning-parser deepseek-v4 \ --chat-template "$(dirname "$0")/../chat_templates/deepseek_v4_thinking.jinja" \ - --watchdog-timeout 1800 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & - + --watchdog-timeout 1800 \ + --enable-metrics \ + "${CACHE_ARGS[@]}" \ + "${WARMUP_ARGS[@]}" > "$SERVER_LOG" 2>&1 & SERVER_PID=$! +echo "Server PID: $SERVER_PID" -# Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -run_benchmark_serving \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ - -# After throughput, run evaluation only if RUN_EVAL is true -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" -# Stop GPU monitoring -stop_gpu_monitor -set +x +run_agentic_replay_and_write_outputs "$RESULT_DIR" From 76d90e0dcbd5ac4dc02c8a90d97428e0481694b3 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Mon, 15 Jun 2026 09:20:25 +0900 Subject: [PATCH 09/21] [AMD] update DSV4-FP4-MI355X SGLang agentic/fixed-seq-len benchmark scripts and master yaml Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 6 +- .../agentic/dsv4_fp4_mi355x_sglang.sh | 104 ++++------ .../fixed_seq_len/dsv4_fp4_mi355x_sglang.sh | 180 ++++++------------ 3 
files changed, 100 insertions(+), 190 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 9b9cceb12..d2a90a6b1 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2751,5 +2751,7 @@ dsv4-fp4-mi355x-sglang-agentic-hicache: - duration: 1800 search-space: #DPA, conc>=64 - - { tp: 8, dp-attn: false, offloading: none, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256] } - - { tp: 8, dp-attn: false, offloading: hicache, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256] } \ No newline at end of file + #- { tp: 8, dp-attn: false, offloading: none, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256] } + #- { tp: 8, dp-attn: false, offloading: hicache, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256] } + - { tp: 8, dp-attn: false, offloading: none, conc-list: [64] } + - { tp: 8, dp-attn: false, offloading: hicache, conc-list: [64] } \ No newline at end of file diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh index 236895cd2..c6f6cba25 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh @@ -47,18 +47,20 @@ amd-smi || true # ---- Resolve traces and install deps ---------------------------------------- # https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-060826 -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826 +# export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226 # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source install_agentic_deps +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Hicache config ---------------------------------------------------------- # Reject anything other than none: this launcher has no SGLang 
CPU-offload # wiring (different surface than vLLM's SimpleCPUOffloadConnector). -CACHE_ARGS=() -WARMUP_ARGS=() -CUDA_GRAPH_MAX_BS="$CONC" -[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64 + case "$OFFLOADING" in none) # Leave SGLang's default RadixAttention prefix cache on — agentic @@ -96,63 +98,35 @@ case "$OFFLOADING" in ;; esac -# Transformers in the container doesn't recognize the `deepseek_v4` model_type. -# PR #23608's fallback in hf_transformers_utils.get_config tries to handle this -# by writing a patched config to /tmp, but in practice isn't catching the error -# in this image. Patch the cached config.json directly instead: set model_type -# to `deepseek_v3` so AutoConfig.from_pretrained succeeds, and keep -# architectures=['DeepseekV4ForCausalLM'] so SGLang dispatches to its native -# DSv4 model class (python/sglang/srt/models/deepseek_v4.py). -python3 << PYEOF -import json -from huggingface_hub import hf_hub_download -path = hf_hub_download(repo_id="$MODEL", filename="config.json") -with open(path) as f: - config = json.load(f) -if config.get("model_type") == "deepseek_v4": - config["model_type"] = "deepseek_v3" - with open(path, "w") as f: - json.dump(config, f, indent=2) - print(f"Patched {path}: model_type deepseek_v4 -> deepseek_v3") -else: - print(f"No patch needed: model_type is {config.get('model_type')!r}") -PYEOF - -# DSv4 FP4-experts path. Mirrors the env block in the fixed-seq-len sibling -# (benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh), which tracks the active -# block in python/run_dsv4.sh on the amd/deepseek_v4 branch: -# SGLANG_DSV4_FP4_EXPERTS=True -> route experts through FP4 kernels -# SGLANG_FORCE_TRITON_MOE_FP8=0 -> dispatch MoE through aiter and apply -# the swiglu_limit clamp in the triton -# MoE fallback path. 
-export SGLANG_REASONING_EFFORT=max -export SGLANG_OPT_USE_FUSED_COMPRESS=true -export SGLANG_OPT_USE_OLD_COMPRESSOR=true -export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false -export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false -export SGLANG_OPT_USE_FUSED_HASH_TOPK=false +# ---- LLM server config ---------------------------------------------------------- + +CACHE_ARGS=() +WARMUP_ARGS=() +CUDA_GRAPH_MAX_BS="$CONC" +[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64 + +export SGLANG_DEFAULT_THINKING=1 +export SGLANG_DSV4_REASONING_EFFORT=max export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false +export SGLANG_USE_AITER=1 +export SGLANG_USE_ROCM700A=0 +export SGLANG_OPT_USE_FUSED_COMPRESS=true +export SGLANG_HACK_FLASHMLA_BACKEND=unified_kv_triton +export SGLANG_OPT_FP8_WO_A_GEMM=false +export SGLANG_OPT_USE_JIT_INDEXER_METADATA=false +export SGLANG_OPT_USE_TOPK_V2=false +export SGLANG_OPT_USE_AITER_INDEXER=true +export SGLANG_OPT_USE_TILELANG_INDEXER=false export SGLANG_OPT_USE_TILELANG_MHC_PRE=false export SGLANG_OPT_USE_TILELANG_MHC_POST=false -export SGLANG_OPT_USE_AITER_MHC_PRE=true -export SGLANG_OPT_USE_AITER_MHC_POST=true -export SGLANG_ENABLE_THINKING=1 -export SGLANG_USE_AITER=1 -export SGLANG_USE_ROCM700A=1 -export SGLANG_TOPK_TRANSFORM_512_TORCH=0 export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1 -export SGLANG_DSV4_FP4_EXPERTS=True -export SGLANG_OPT_DPSK_V4_RADIX=0 -export SGLANG_OPT_USE_OVERLAP_STORE_CACHE=false -export SGLANG_OPT_USE_FUSED_STORE_CACHE=false -export SGLANG_FORCE_TRITON_MOE_FP8=0 -export SGLANG_HACK_FLASHMLA_BACKEND=tilelang -export SGLANG_OPT_USE_TILELANG_INDEXER=true -export SGLANG_OPT_USE_TRITON_SWA_PREPARE=true +export SGLANG_OPT_USE_FUSED_COMPRESS_TRITON=true +export AITER_BF16_FP8_MOE_BOUND=0 +export SGLANG_EAGER_INPUT_NO_COPY=true -# ---- Server config ---------------------------------------------------------- -SERVER_LOG="$RESULT_DIR/server.log" -mkdir -p "$RESULT_DIR" +# multi-stream +export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=false +export 
SGLANG_ROCM_USE_MULTI_STREAM=false # Parallelism: pure TP, TP+EP, or DEP (DP-attn + EP). Matches the dsv4 b200 # vllm agentic launcher so the agentic sweep can probe both interactivity and @@ -181,17 +155,19 @@ else fi echo "Starting sglang server..." -python3 -m sglang.launch_server \ - --model-path "$MODEL_PATH" --served-model-name "$MODEL" \ +sglang serve \ + --model-path $MODEL \ --host=0.0.0.0 \ - --port "$PORT" \ + --port $PORT \ "${PARALLEL_ARGS[@]}" \ --trust-remote-code \ - --attention-backend compressed \ - --max-running-requests "$PER_ENGINE_MAX_RUNNING" \ - --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" \ + --disable-radix-cache \ + --attention-backend dsv4 \ + --max-running-requests ${CONC} \ + --mem-fraction-static 0.90 \ + --swa-full-tokens-ratio 0.15 \ --page-size 256 \ - --context-length "$MAX_MODEL_LEN" \ + --context-length $MAX_MODEL_LEN \ --chunked-prefill-size 8192 \ --disable-shared-experts-fusion \ --tool-call-parser deepseekv4 \ diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh index c6f6cba25..6797f1023 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh @@ -1,109 +1,28 @@ #!/usr/bin/env bash -set -euo pipefail -set -x - -# Agentic trace replay benchmark for DeepSeek-V4-Pro FP4 on MI355X using SGLang. -# Adapted from benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh (fixed-seq-len -# sibling) with the agentic harness (build_replay_cmd / write_agentic_result_json -# / analyze_benchmark_distributions) swapped in for run_benchmark_serving. -# -# This launcher does NOT support CPU offload. SGLang's KV offload paths are -# different from vLLM's SimpleCPUOffloadConnector, and the matching agentic -# config (dsv4-fp4-mi355x-sglang-agentic) only sweeps offloading=none. 
-# -# Required env vars: -# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION - -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=1000000 -fi - -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +check_env_vars \ + MODEL \ + TP \ + DP_ATTENTION \ + EP_SIZE \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + MAX_MODEL_LEN + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -# ROCR/HIP visibility under slurm cgroups. -if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then - export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" -fi +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi -# `hf download` creates the target dir if missing and is itself idempotent. -# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE -# Either way, MODEL_PATH is what the server is launched with. -if [[ -n "${MODEL_PATH:-}" ]]; then - if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then - hf download "$MODEL" --local-dir "$MODEL_PATH" - fi -else - hf download "$MODEL" - export MODEL_PATH="$MODEL" -fi -rocm-smi || true -amd-smi || true - -# ---- Resolve traces and install deps ---------------------------------------- -# https://huggingface.co/datasets/semianalysisai/cc-traces-weka-with-subagents-060826 -# export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226 - -# ---- Resolve traces and install deps ---------------------------------------- -resolve_trace_source -install_agentic_deps - -# ---- Server config ---------------------------------------------------------- -SERVER_LOG="$RESULT_DIR/server.log" -mkdir -p "$RESULT_DIR" - -# ---- Hicache config ---------------------------------------------------------- -# Reject anything other than none: this launcher has no SGLang CPU-offload -# wiring (different surface than vLLM's SimpleCPUOffloadConnector). - -case "$OFFLOADING" in - none) - # Leave SGLang's default RadixAttention prefix cache on — agentic - # replay needs it; --disable-radix-cache would zero the hit rate. - ;; - hicache) - # DeepSeek V4 HiCache uses ratio-based capacity control, not GB-based. - # DSv4 allocates several physical host sub-pools for each logical host - # token. MI355X nodes have ~3 TB of host DRAM (similar to B200's 3.8 - # TiB), so ratio=8 at TP≥8 provides a large useful CPU tier within the - # node budget. Lower TP configs use higher ratios to maintain adequate - # host token capacity without exceeding DRAM limits. 
- if [ "$TP" -ge 8 ]; then - DEFAULT_HICACHE_RATIO=8 - else - DEFAULT_HICACHE_RATIO=16 - fi - HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}" - HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" - HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" - HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first_direct}" - export SGLANG_ENABLE_UNIFIED_RADIX_TREE=1 - CACHE_ARGS=( - --enable-hierarchical-cache - --hicache-ratio "$HICACHE_RATIO" - --hicache-write-policy "$HICACHE_WRITE_POLICY" - --hicache-io-backend "$HICACHE_IO_BACKEND" - --hicache-mem-layout "$HICACHE_MEM_LAYOUT" - ) - echo "HiCache DSv4 CPU tier: ratio=$HICACHE_RATIO, write_policy=$HICACHE_WRITE_POLICY, io_backend=$HICACHE_IO_BACKEND, mem_layout=$HICACHE_MEM_LAYOUT" - ;; - *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 - exit 1 - ;; -esac - -# ---- LLM server config ---------------------------------------------------------- - -CACHE_ARGS=() -WARMUP_ARGS=() -CUDA_GRAPH_MAX_BS="$CONC" -[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64 +# sglang ships in the image at the SHA encoded in the image tag (built +# from the amd/deepseek_v4 branch in sgl-project/sglang). To bump sglang, +# bump the image tag in .github/configs/amd-master.yaml. export SGLANG_DEFAULT_THINKING=1 export SGLANG_DSV4_REASONING_EFFORT=max @@ -128,33 +47,31 @@ export SGLANG_EAGER_INPUT_NO_COPY=true export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=false export SGLANG_ROCM_USE_MULTI_STREAM=false -# Parallelism: pure TP, TP+EP, or DEP (DP-attn + EP). Matches the dsv4 b200 -# vllm agentic launcher so the agentic sweep can probe both interactivity and -# throughput regimes. 
-PARALLEL_ARGS=(--tensor-parallel-size "$TP") -if [ "$DP_ATTENTION" = "true" ]; then +SERVER_LOG=/workspace/server.log + +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +PARALLEL_ARGS=( + --tensor-parallel-size "$TP" +) +if [ "${DP_ATTENTION}" = "true" ]; then PARALLEL_ARGS+=( --dp "$TP" --enable-dp-attention --enable-prefill-delayer + --prefill-delayer-max-delay-ms 5000 ) fi if [ "${EP_SIZE:-1}" -gt 1 ]; then PARALLEL_ARGS+=(--ep-size "$EP_SIZE") fi -# --max-running-requests is per-engine. With DP-attn each DP engine handles -# only CONC/$TP sequences in steady state (the agentic harness load-balances -# users across DP ranks), so size the per-engine cap to that. -# Pure TP is a single engine and sees all CONC sequences itself. -if [ "$DP_ATTENTION" = "true" ]; then - PER_ENGINE_MAX_RUNNING=$(( CONC / TP )) - [ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1 -else - PER_ENGINE_MAX_RUNNING=$CONC -fi - -echo "Starting sglang server..." sglang serve \ --model-path $MODEL \ --host=0.0.0.0 \ @@ -173,16 +90,31 @@ sglang serve \ --tool-call-parser deepseekv4 \ --reasoning-parser deepseek-v4 \ --chat-template "$(dirname "$0")/../chat_templates/deepseek_v4_thinking.jinja" \ - --watchdog-timeout 1800 \ - --enable-metrics \ - "${CACHE_ARGS[@]}" \ - "${WARMUP_ARGS[@]}" > "$SERVER_LOG" 2>&1 & + --watchdog-timeout 1800 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + SERVER_PID=$! 
-echo "Server PID: $SERVER_PID" +# Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi -run_agentic_replay_and_write_outputs "$RESULT_DIR" +# Stop GPU monitoring +stop_gpu_monitor +set +x \ No newline at end of file From 4ebc4e2fe2f74f6e39d4fcd50de0fcffd7074d22 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Mon, 15 Jun 2026 11:55:10 +0900 Subject: [PATCH 10/21] [AMD] remove unused CACHE_ARGS from dsv4_fp4_mi355x_sglang agentic script Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh index c6f6cba25..73ebac6f7 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh @@ -100,7 +100,6 @@ esac # ---- LLM server config ---------------------------------------------------------- -CACHE_ARGS=() WARMUP_ARGS=() CUDA_GRAPH_MAX_BS="$CONC" [ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64 From 735e9a3a8df700f1a06a7b66c2148794f7d7aac6 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Mon, 15 Jun 2026 11:56:58 +0900 Subject: [PATCH 11/21] [AMD] tune hicache ratio and disable none-offloading in agentic config Co-Authored-By: Claude Sonnet 4.6 
--- .github/configs/amd-master.yaml | 2 +- benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index d2a90a6b1..88a6046f7 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2753,5 +2753,5 @@ dsv4-fp4-mi355x-sglang-agentic-hicache: #DPA, conc>=64 #- { tp: 8, dp-attn: false, offloading: none, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256] } #- { tp: 8, dp-attn: false, offloading: hicache, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256] } - - { tp: 8, dp-attn: false, offloading: none, conc-list: [64] } + #- { tp: 8, dp-attn: false, offloading: none, conc-list: [64] } - { tp: 8, dp-attn: false, offloading: hicache, conc-list: [64] } \ No newline at end of file diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh index 73ebac6f7..2003c2761 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh @@ -74,7 +74,7 @@ case "$OFFLOADING" in # node budget. Lower TP configs use higher ratios to maintain adequate # host token capacity without exceeding DRAM limits. 
if [ "$TP" -ge 8 ]; then - DEFAULT_HICACHE_RATIO=8 + DEFAULT_HICACHE_RATIO=2 else DEFAULT_HICACHE_RATIO=16 fi From d3caa2b94ecd72640d86e49a798de572ea4249fa Mon Sep 17 00:00:00 2001 From: seungrokj Date: Mon, 15 Jun 2026 13:00:39 +0900 Subject: [PATCH 12/21] [AMD] remove --disable-radix-cache from dsv4_fp4_mi355x_sglang agentic script Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh index 2003c2761..5c780b646 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh @@ -160,7 +160,6 @@ sglang serve \ --port $PORT \ "${PARALLEL_ARGS[@]}" \ --trust-remote-code \ - --disable-radix-cache \ --attention-backend dsv4 \ --max-running-requests ${CONC} \ --mem-fraction-static 0.90 \ From c11f63776cc849220aefd380fca0b2e2c783e3f2 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 12 Jun 2026 03:56:25 +0000 Subject: [PATCH 13/21] [AMD] add dsv4 sglang disagg --- .github/configs/amd-master.yaml | 623 +++++++++--------- benchmarks/multi_node/amd_utils/bench.sh | 7 +- benchmarks/multi_node/amd_utils/env.sh | 55 ++ benchmarks/multi_node/amd_utils/models.yaml | 35 + .../multi_node/amd_utils/server_sglang.sh | 20 +- benchmarks/multi_node/amd_utils/submit.sh | 10 + .../dsv4_fp4_mi355x_sglang-disagg.sh | 83 +++ 7 files changed, 525 insertions(+), 308 deletions(-) create mode 100755 benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 88a6046f7..80c14f58b 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -16,6 +16,7 @@ dsr1-fp4-mi355x-sglang: - isl: 8192 osl: 1024 search-space: + - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } # Agentic-coding sweep 
commented out for this image-bump PR — the # 10-conc agentic matrix amplifies sweep cost and the bump validation @@ -261,7 +262,7 @@ qwen3.5-fp8-mi325x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x @@ -273,17 +274,14 @@ qwen3.5-fp8-mi355x-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, conc-start: 64, conc-end: 256 } - - { tp: 2, ep: 2, conc-start: 128, conc-end: 256 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } qwen3.5-fp8-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x @@ -295,14 +293,30 @@ qwen3.5-fp8-mi355x-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 2, ep: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + +# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). 
Metadata is +# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main. +qwen3.5-fp8-mi355x-sglang-agentic: + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } qwen3.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 @@ -413,7 +427,7 @@ qwen3.5-fp8-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" qwen3.5-fp4-mi355x-sglang: - image: lmsysorg/sglang:v0.5.12-rocm720-mi35x + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260604 model: amd/Qwen3.5-397B-A17B-MXFP4 model-prefix: qwen3.5 runner: mi355x @@ -433,22 +447,6 @@ qwen3.5-fp4-mi355x-sglang: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 16 } -# target -qwen3.5-fp4-mi355x-sglang-agentic-hicache: - image: lmsysorg/sglang:v0.5.12-rocm720-mi35x - model: amd/Qwen3.5-397B-A17B-MXFP4 - model-prefix: qwen3.5 - runner: mi355x - precision: fp4 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 2, ep: 1, offloading: none, conc-list: [8, 16, 32, 40, 48, 56, 72] } - - { tp: 2, ep: 1, offloading: hicache, conc-list: [8, 16, 32, 40, 48, 56, 72] } - qwen3.5-fp4-mi355x-atom: image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: amd/Qwen3.5-397B-A17B-MXFP4 @@ -471,7 +469,7 @@ qwen3.5-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 16 } qwen3.5-fp4-mi355x-sglang-mtp: - image: 
lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260604 model: amd/Qwen3.5-397B-A17B-MXFP4 model-prefix: qwen3.5 runner: mi355x @@ -483,12 +481,12 @@ qwen3.5-fp4-mi355x-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp } - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp } - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp } qwen3.5-fp4-mi355x-sglang-disagg: @@ -701,6 +699,26 @@ glm5.1-fp4-mi355x-sglang: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 16 } +# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). Metadata is +# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main. 
+glm5.1-fp4-mi355x-sglang-agentic: + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 + model: amd/GLM-5.1-MXFP4 + model-prefix: glm5.1 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + glm5.1-fp4-mi355x-atom: image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: amd/GLM-5.1-MXFP4 @@ -721,7 +739,7 @@ glm5.1-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 256 } kimik2.5-int4-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.22.0 + image: vllm/vllm-openai-rocm:v0.21.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: mi355x @@ -740,7 +758,7 @@ kimik2.5-int4-mi355x-vllm: - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-mi325x-vllm: - image: vllm/vllm-openai-rocm:v0.22.0 + image: vllm/vllm-openai-rocm:v0.21.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: mi325x @@ -759,7 +777,7 @@ kimik2.5-int4-mi325x-vllm: - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.22.0 + image: vllm/vllm-openai-rocm:v0.21.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: mi300x @@ -798,6 +816,38 @@ kimik2.5-fp4-mi355x-vllm: - { tp: 8, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } +# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below; +# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0' +kimik2.5-fp4-mi355x-vllm-agentic: + # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin + # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm + # cpu_offload_blocks > 0 fix. 
v0.21.0 is much newer than that fix and + # includes all subsequent ROCm offload work. + image: vllm/vllm-openai-rocm:v0.21.0 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] } + # CPU offload only above the KV cliff. Lower concurrencies fit + # entirely on-GPU, so paying the offload-path overhead there would + # just slow them down without measuring anything new. + - { tp: 8, offloading: cpu, conc-list: [32, 40, 48, 56] } + # TP=4 probe: half-node layout doubles per-GPU weight footprint + # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to + # cliff-region concurrencies on both offload modes so we can directly + # compare TP=4 vs TP=8 at the same conc points. + - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } + - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } + kimik2.5-fp4-mi355x-atom: image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 model: amd/Kimi-K2.5-MXFP4 @@ -842,6 +892,33 @@ minimaxm2.5-fp8-mi355x-vllm: - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 } - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } +# Diverged from minimaxm2.5-fp8-mi355x-vllm (agentic-coding sibling). Reasons below; +# the original minimaxm2.5-fp8-mi355x-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'vllm/vllm-openai-rocm:v0.19.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' +minimaxm2.5-fp8-mi355x-vllm-agentic: + # Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector] + # add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"), + # which enables SimpleCPUOffloadConnector on ROCm. Required for the + # cpu-offload sweep points to use the same offload path as the NVIDIA + # agentic-coding configs. 
+ image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi355x + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical). + # Compute saturates first; cpu offload likely won't help, but worth confirming. + # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). + - duration: 1800 + search-space: + - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] } + - { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] } + minimaxm2.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: MiniMaxAI/MiniMax-M2.5 @@ -888,22 +965,6 @@ minimaxm2.5-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 16 } -# target -minimaxm2.5-fp4-mi355x-vllm-agentic-lmcache: - image: vllm/vllm-openai-rocm:v0.22.0 - model: amd/MiniMax-M2.5-MXFP4 - model-prefix: minimaxm2.5 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 1, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } - - { tp: 1, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } - minimaxm2.5-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.22.0 model: amd/MiniMax-M2.5-MXFP4 @@ -928,7 +989,7 @@ minimaxm2.5-fp4-mi355x-vllm: - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.22.0 + image: vllm/vllm-openai-rocm:v0.21.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi300x @@ -948,6 +1009,29 @@ minimaxm2.5-fp8-mi300x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } +# Diverged from minimaxm2.5-fp8-mi300x-vllm (agentic-coding sibling). 
Reasons below; +# the original minimaxm2.5-fp8-mi300x-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'vllm/vllm-openai-rocm:v0.16.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' +minimaxm2.5-fp8-mi300x-vllm-agentic: + # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. + image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi300x + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200); + # KV cliff ~52. Compute saturates first. + # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } + - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } + minimaxm2.5-fp8-mi325x-vllm: image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 @@ -969,8 +1053,32 @@ minimaxm2.5-fp8-mi325x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } +# Diverged from minimaxm2.5-fp8-mi325x-vllm (agentic-coding sibling). Reasons below; +# the original minimaxm2.5-fp8-mi325x-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' +minimaxm2.5-fp8-mi325x-vllm-agentic: + # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. 
+ image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi325x + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # MI325X tp=4: cloned from MI300X recipe (slightly faster compute, + # similar HBM profile). Compute saturates first; cpu-offload window + # exercises the SimpleCPUOffloadConnector path enabled by the rocm + # nightly. Mirror MI300X conc grid for cross-vendor comparability. + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } + - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } + gptoss-fp4-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.22.0 + image: vllm/vllm-openai-rocm:v0.17.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: mi300x @@ -1021,7 +1129,7 @@ gptoss-fp4-mi325x-vllm: gptoss-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.22.0 - model: amd/gpt-oss-120b-w-mxfp4-a-fp8 + model: openai/gpt-oss-120b model-prefix: gptoss runner: mi355x precision: fp4 @@ -1084,7 +1192,7 @@ dsr1-fp8-mi355x-atom: - { tp: 8, conc-start: 4, conc-end: 128 } dsr1-fp8-mi355x-atom-mtp: - image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 + image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x @@ -1096,7 +1204,7 @@ dsr1-fp8-mi355x-atom-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -1411,7 +1519,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:v0.22.0 + image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x-disagg @@ 
-1465,7 +1573,7 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:v0.22.0 + image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x-disagg @@ -1858,6 +1966,7 @@ dsr1-fp4-mi355x-sglang-disagg-1k1k-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" + dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529 model: amd/DeepSeek-R1-0528-MXFP4-v2 @@ -1968,6 +2077,7 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=3" + # 1*DEP8 + 1*DEP8 - spec-decoding: "mtp" conc-list: [ 128 ] @@ -2025,8 +2135,72 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" + +# DSv4 PD-disaggregation on MI355X via SGLang + MoRI. Structure mirrors +# dsr1-fp4-mi355x-sglang-disagg but only the isl 8192 / osl 1024 scenario, with two +# topology families captured from the validated manual recipe (see +# dsv4_mi355x_sglang_disagg_plan.md): +# - pure-TP 1P1D (TP8, mori KV transfer) +# - DEP 1P1D (TP8/EP8/DP8, mori KV transfer + mori MoE a2a, dp-attention) +# DSv4-specific serving knobs (attention-backend dsv4, page-size 256, unified_kv_triton, +# AITER indexer, deepseekv4 parsers) live in amd_utils/{models.yaml,env.sh}; the bench +# client uses --dsv4 framing (amd_utils/bench.sh). STP only for now (reference recipe has +# no spec decoding); MTP is a follow-up. 
+dsv4-fp4-mi355x-sglang-disagg: + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # 1P1D pure TP8 (mori KV transfer) + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P1D DEP8 (mori KV transfer + mori MoE a2a, dp-attention) + - spec-decoding: "none" + conc-list: [ 512, 768, 1024 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + dsv4-fp4-mi355x-sglang: - image: rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4 + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x @@ -2081,6 +2255,25 @@ dsv4-fp4-mi355x-sglang-mtp: - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048, spec-decoding: mtp } - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32, spec-decoding: mtp } +# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm +# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged +# on 2026-05-05, so any nightly built after that includes the +# DeepseekV4ForCausalLM model class. +# +# IMPORTANT: pin to a digest-suffixed nightly tag rather than the +# floating `:nightly`. 
launch_mi355x-amds.sh caches enroot squashfs +# files keyed on the image string and short-circuits re-import if the +# file already exists, so the floating tag silently keeps a stale build +# even after Docker Hub updates `:nightly`. +# +# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the +# rest); InferenceX classifies this as fp4 — same as the sister sglang +# and atom DSv4 mi355x entries below. Image and serving flags follow the +# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp +# executor, triton_unfused MoE (required for the FP4 expert format), +# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, +# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64 +# probe to validate the ROCm DP+EP path. dsv4-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro @@ -2174,23 +2367,6 @@ dsv4-fp4-mi355x-atom-mtp: search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 1024, spec-decoding: mtp } -# target -dsv4-fp4-mi355x-atom-agentic-lmcache: - image: rocm/atom-dev:nightly_202606101557 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp4 - framework: atom - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [52] } - #- { tp: 8, ep: 1, offloading: none, conc-list: [44, 48, 52, 56, 60] } - #- { tp: 8, ep: 1, offloading: lmcache, conc-list: [44, 48, 52, 56, 60] } - qwen3.5-bf16-mi325x-sglang-mtp: image: lmsysorg/sglang:v0.5.12-rocm720-mi30x model: Qwen/Qwen3.5-397B-A17B @@ -2286,6 +2462,44 @@ glm5-fp8-mi325x-sglang-mtp: search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } +# ============================================================================ +# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries). 
+# Recipes that ALREADY existed on main were intentionally left at main's version +# to preserve main behavior; PR-branch modifications to those recipes are NOT +# brought in here. +# ============================================================================ + +qwen3.5-fp8-mi355x-sglang-agentic-hicache: + image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } + +dsv4-fp4-mi355x-vllm-agentic: + image: vllm/vllm-openai-rocm:v0.21.0 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4] } + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] } + - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] } + dsr1-fp4-mi355x-sglang-disagg-mtp: image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 model: amd/DeepSeek-R1-0528-MXFP4-v2 @@ -2514,214 +2728,20 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" -qwen3.5-fp8-mi355x-sglang-agentic: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - -glm5.1-fp4-mi355x-sglang-agentic: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 - model: amd/GLM-5.1-MXFP4 - model-prefix: glm5.1 - runner: mi355x - precision: fp4 - framework: sglang - 
multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - -# target -glm5.1-fp4-mi355x-sglang-agentic-hicache: - image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529 - model: amd/GLM-5.1-MXFP4 - model-prefix: glm5.1 - runner: mi355x - precision: fp4 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively - - { tp: 2, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48] } - - { tp: 2, ep: 1, offloading: hicache, conc-list: [4, 8, 16, 32, 40, 48] } - -kimik2.5-fp4-mi355x-vllm-agentic: - image: vllm/vllm-openai-rocm:v0.22.0 - model: amd/Kimi-K2.5-MXFP4 - model-prefix: kimik2.5 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] } - # CPU offload only above the KV cliff. Lower concurrencies fit - # entirely on-GPU, so paying the offload-path overhead there would - # just slow them down without measuring anything new. - - { tp: 8, offloading: cpu, conc-list: [32, 40, 48, 56] } - # TP=4 probe: half-node layout doubles per-GPU weight footprint - # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to - # cliff-region concurrencies on both offload modes so we can directly - # compare TP=4 vs TP=8 at the same conc points. 
- - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } - -# target -kimik2.5-fp4-mi355x-vllm-agentic-lmcache: - image: vllm/vllm-openai-rocm:v0.22.0 - model: amd/Kimi-K2.5-MXFP4 - model-prefix: kimik2.5 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } - - { tp: 4, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } - -# target -kimik2.5-fp4-mi355x-vllm-agentic-lmcache-060226DRAM1500GB: - image: vllm/vllm-openai-rocm:v0.22.0 - model: amd/Kimi-K2.5-MXFP4 - model-prefix: kimik2.5 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - #- { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } - - { tp: 4, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } - -minimaxm2.5-fp8-mi355x-vllm-agentic: - image: vllm/vllm-openai-rocm:v0.22.0 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: mi355x - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical). - # Compute saturates first; cpu offload likely won't help, but worth confirming. - # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). 
- - duration: 1800 - search-space: - - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] } - - { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] } - -# target -minimaxm2.5-fp8-mi355x-vllm-agentic-lmcache: - image: vllm/vllm-openai-rocm:v0.22.0 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: mi355x - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical). - # Compute saturates first; cpu offload likely won't help, but worth confirming. - # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). - - duration: 1800 - search-space: - - { tp: 2, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } - - { tp: 2, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] } - -minimaxm2.5-fp8-mi300x-vllm-agentic: - image: vllm/vllm-openai-rocm:v0.22.0 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: mi300x - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200); - # KV cliff ~52. Compute saturates first. - # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). - - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } - - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } - -minimaxm2.5-fp8-mi325x-vllm-agentic: - image: vllm/vllm-openai-rocm:v0.22.0 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: mi325x - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # MI325X tp=4: cloned from MI300X recipe (slightly faster compute, - # similar HBM profile). Compute saturates first; cpu-offload window - # exercises the SimpleCPUOffloadConnector path enabled by the rocm - # nightly. 
Mirror MI300X conc grid for cross-vendor comparability. - - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } - - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } - -# target -qwen3.5-fp8-mi355x-sglang-agentic-hicache: - image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260531 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 4, ep: 1, offloading: none, conc-list: [8, 16, 32, 40, 48, 56, 72] } - - { tp: 4, ep: 1, offloading: hicache, conc-list: [8, 16, 32, 40, 48, 56, 72] } - -dsv4-fp4-mi355x-vllm-agentic: - image: vllm/vllm-openai-rocm:v0.22.0 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4] } - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] } - - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] } +# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the +# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the +# image tag, so bumping sglang is just an image tag bump here. Sweeps +# DP-attention on/off and EP=8. + +# Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). Reasons below; +# the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - scenarios: replaced fixed-seq-len with agentic-coding. +# Image is a separately pinned rocm/sgl-dev DSv4 build; its tag differs from the base entry's image. +# CONC ranges mirror dsv4-fp4-b200-vllm-agentic for cross-hardware +# comparability. 
Offload sweep is none-only (SGLang has no equivalent of +# vLLM's SimpleCPUOffloadConnector path that we exercise on b200). dsv4-fp4-mi355x-sglang-agentic: image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4 model: deepseek-ai/DeepSeek-V4-Pro @@ -2737,21 +2757,22 @@ dsv4-fp4-mi355x-sglang-agentic: - { tp: 8, offloading: none, conc-list: [16, 32, 64] } - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] } -# target -dsv4-fp4-mi355x-sglang-agentic-hicache: - image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp4 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - #DPA, conc>=64 - #- { tp: 8, dp-attn: false, offloading: none, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256] } - #- { tp: 8, dp-attn: false, offloading: hicache, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256] } - #- { tp: 8, dp-attn: false, offloading: none, conc-list: [64] } - - { tp: 8, dp-attn: false, offloading: hicache, conc-list: [64] } \ No newline at end of file +# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm +# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged +# on 2026-05-05, so any nightly built after that includes the +# DeepseekV4ForCausalLM model class. +# +# IMPORTANT: pin to a digest-suffixed nightly tag rather than the +# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs +# files keyed on the image string and short-circuits re-import if the +# file already exists, so the floating tag silently keeps a stale build +# even after Docker Hub updates `:nightly`. +# +# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the +# rest); InferenceX classifies this as fp4 — same as the sister sglang +# and atom DSv4 mi355x entries below. 
Image and serving flags follow the +# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp +# executor, triton_unfused MoE (required for the FP4 expert format), +# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, +# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64 +# probe to validate the ROCm DP+EP path. diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 05384f435..d198a4ddd 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -79,7 +79,12 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do if [[ "$ENGINE" == "vllm-disagg" ]]; then extra_flags="--trust-remote-code --tokenizer $MODEL_PATH" else - if [ "$IS_MTP" = "true" ]; then + # DeepSeek-V4-Pro ships no jinja chat_template, so --use-chat-template crashes; + # --dsv4 applies the DSv4 ... framing instead + # (chat-formatted inputs are required for correct EAGLE/MTP acceptance too). + if [[ "$model_name" == "DeepSeek-V4-Pro" ]]; then + extra_flags="--dsv4" + elif [ "$IS_MTP" = "true" ]; then extra_flags="--use-chat-template" fi fi diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 71d2653bd..6b0e4206a 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -228,6 +228,61 @@ $1 == "DSCP" && $2 == ":" && $NF == p { fi fi + # ========================================================================= + # DeepSeek-V4-Pro PD recipe overrides + # Placed at the end of the SGLang env block so it wins over the global + # MoRI/SGLang defaults set above. Mirrors the validated DSv4 manual PD + # commands (see dsv4_mi355x_sglang_disagg_plan.md §2). Only the SGLang/MoRI + # env knobs are pinned here; CLI flags live in models.yaml and the cluster + # NIC/socket vars (NCCL_IB_HCA, *_SOCKET_IFNAME, IBDEVICES) stay runner-derived. 
+ # ========================================================================= + if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + # MoRI dispatch/combine dtypes: auto for both roles (not the fp8 split default) + export SGLANG_MORI_DISPATCH_DTYPE=auto + export MORI_COMBINE_DTYPE_PREFILL=auto + export MORI_COMBINE_DTYPE_DECODE=auto + + # Per-role MoRI dispatch sizing (used by the harness chunked/MoE math) + export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192 + export MORI_MAX_DISPATCH_TOKENS_DECODE=64 + export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=2048 + export MORI_MOE_MAX_INPUT_TOKENS_DECODE=332 + + # PER_RANK dispatch tokens are pinned independently of the sizing above + # (16384 prefill / 128 decode in the reference recipe). server_sglang.sh + # prefers these over the MORI_MAX_DISPATCH_TOKENS_* coupling when set. + export MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL=16384 + export MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_DECODE=128 + + # Fixed inter-kernel switch threshold (not derived). NOTE: the DP+EP path in + # server_sglang.sh recomputes this dynamically for the DEP topology. 
+ export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=4096 + + # Overlap plan stream pinned to 0 (off) for DSv4, same as the global default of 0; NOTE(review): confirm 1 was not intended + export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=0 + + # DSv4 model kernel routing (mirrors the single-node / manual PD recipe) + export SGLANG_DEFAULT_THINKING=1 + export SGLANG_DSV4_REASONING_EFFORT=max + export SGLANG_USE_ROCM700A=0 + export SGLANG_HACK_FLASHMLA_BACKEND=unified_kv_triton + export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false + export SGLANG_OPT_USE_FUSED_COMPRESS=true + export SGLANG_OPT_USE_FUSED_COMPRESS_TRITON=true + export SGLANG_OPT_FP8_WO_A_GEMM=false + export SGLANG_OPT_USE_JIT_INDEXER_METADATA=false + export SGLANG_OPT_USE_TOPK_V2=false + export SGLANG_OPT_USE_AITER_INDEXER=true + export SGLANG_OPT_USE_TILELANG_INDEXER=false + export SGLANG_OPT_USE_TILELANG_MHC_PRE=false + export SGLANG_OPT_USE_TILELANG_MHC_POST=false + export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1 + export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=false + export SGLANG_ROCM_USE_MULTI_STREAM=false + export AITER_BF16_FP8_MOE_BOUND=0 + export SGLANG_EAGER_INPUT_NO_COPY=true + fi + # FIXME: WA for latest upstream 0305 image export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 605a377be..e68c448ce 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -349,3 +349,38 @@ DeepSeek-R1-0528-MXFP4-v2: max_running_requests: 128 chunked_prefill_size: 262144 cuda_graph_bs_range: "1-128" + +# DeepSeek-V4-Pro PD-disaggregation recipe (MI355X, SGLang + MoRI). +# KV transfer = mori for both topologies (pure-TP and DEP); the DP path additionally +# routes the MoE all-to-all through mori (--moe-a2a-backend mori) with dp-attention. 
+# DSv4-specific kernel routing (unified_kv_triton, AITER indexer, fp8 wo_a fallback, +# thinking/reasoning-effort, dispatch dtypes, per-role PER_RANK dispatch tokens) is set +# in env.sh's DeepSeek-V4-Pro block. The bench client uses --dsv4 framing (bench.sh). +# prefill.disable_cuda_graph routes prefill to --disable-cuda-graph; decode keeps +# --cuda-graph-bs. See dsv4_mi355x_sglang_disagg_plan.md. +DeepSeek-V4-Pro: + base_flags: "--decode-log-interval 100 --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend dsv4 --page-size 256 --swa-full-tokens-ratio 0.1 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --disaggregation-transfer-backend mori" + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + disable_cuda_graph: true + dp: + max_running_requests: 1024 + chunked_prefill_size: 131072 + context_length: 9217 + max_total_tokens: 262144 + no_dp: + max_running_requests: 128 + chunked_prefill_size: 131072 + context_length: 9217 + max_total_tokens: 262144 + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 1024 + cuda_graph_bs_range: "1-128" + no_dp: + max_running_requests: 128 + cuda_graph_bs_range: "1-128" diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index c28ccab41..38fbdfc8e 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -125,6 +125,7 @@ decode = m.get('decode', {}) print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"') print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"') +print(f'PREFILL_DISABLE_CUDA_GRAPH=\"{prefill.get(\"disable_cuda_graph\", False)}\"') dp = prefill.get('dp', {}) 
no_dp = prefill.get('no_dp', {}) @@ -136,6 +137,8 @@ print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"') print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"') print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') +print(f'PREFILL_CONTEXT_LENGTH_NO_DP=\"{no_dp.get(\"context_length\", \"\")}\"') +print(f'PREFILL_MAX_TOTAL_TOKENS_NO_DP=\"{no_dp.get(\"max_total_tokens\", \"\")}\"') s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') @@ -183,8 +186,8 @@ else prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP - prefill_context_length="" - prefill_max_total_tokens="" + prefill_context_length=$PREFILL_CONTEXT_LENGTH_NO_DP + prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_NO_DP prefill_enable_two_batch_overlap="false" fi @@ -222,7 +225,12 @@ if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; t fi # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " +# disable_cuda_graph (model-level) routes prefill to --disable-cuda-graph instead of --cuda-graph-bs. 
+if [[ "$PREFILL_DISABLE_CUDA_GRAPH" == "True" ]] || [[ "$PREFILL_DISABLE_CUDA_GRAPH" == "true" ]]; then + PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --disable-cuda-graph " +else + PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " +fi if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" fi @@ -418,7 +426,7 @@ if [ "$NODE_RANK" -eq 0 ]; then PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" fi set +x - PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL:-${MORI_MAX_DISPATCH_TOKENS_PREFILL}} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/$MODEL_NAME \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ @@ -650,7 +658,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" fi set +x - PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} 
${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL:-${MORI_MAX_DISPATCH_TOKENS_PREFILL}} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/${MODEL_NAME} \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ @@ -718,7 +726,7 @@ else DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}" fi set +x - DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_DECODE:-${MORI_MAX_DISPATCH_TOKENS_DECODE}} python3 -m sglang.launch_server \ --model-path ${MODEL_DIR}/${MODEL_NAME} \ --disaggregation-mode decode \ --disaggregation-ib-device ${IBDEVICES} \ diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index fa3d65418..c264293a7 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -47,6 +47,10 @@ Required environment variables: MODEL_NAME Model name directory CONTAINER_IMAGE Docker image name (e.g., vllm_disagg_pd:latest) RUNNER_NAME Runner identifier (for job name) + +Optional environment variables: + DRY_RUN 1 = echo composed server/router launch commands instead of + running them (preview a recipe against a real allocation). USAGE } @@ -125,6 +129,12 @@ export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export BENCH_REQUEST_RATE=${REQUEST_RATE} export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} +# DRY_RUN=1 makes server_sglang.sh echo the composed prefill/decode/router launch +# commands instead of executing them (useful for previewing a recipe against a real +# allocation). 
Threaded here → job.slurm → Docker (-e DRY_RUN) → server_sglang.sh. +# sbatch defaults to --export=ALL, so exporting it is what carries it into the job. +export DRY_RUN="${DRY_RUN:-0}" + # Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker) export RUN_EVAL="${RUN_EVAL:-false}" export EVAL_ONLY="${EVAL_ONLY:-false}" diff --git a/benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh new file mode 100755 index 000000000..d17d1a323 --- /dev/null +++ b/benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +# Use upstreamed multi_node scripts (no external clone needed) +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then +export PREFILL_ENABLE_EP=false +else +export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then +export PREFILL_ENABLE_DP=true +else +export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then +export DECODE_ENABLE_EP=false +else +export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then +export DECODE_ENABLE_DP=true +else +export DECODE_ENABLE_DP=false +fi + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented +# 
by a list of numbers delimited by 'x'. This is because of how the underlying launch script +# expects the concurrencies. +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO}) + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" From a9e1304067230bb9cbad4eff41ef8eeb58321e1a Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Mon, 15 Jun 2026 15:30:02 +0000 Subject: [PATCH 14/21] recipe for agentic dsr1 fp4 and dsv4 fp4 Signed-off-by: thshan@amd.com Co-authored-by: Cursor --- .github/configs/amd-master.yaml | 119 ++++++++++++ .github/workflows/run-sweep.yml | 1 + .../agentic/dsr1_fp4_mi355x_sglang-disagg.sh | 176 ++++++++++++++++++ .../agentic/dsv4_fp4_mi355x_sglang-disagg.sh | 160 ++++++++++++++++ benchmarks/multi_node/amd_utils/env.sh | 32 +++- benchmarks/multi_node/amd_utils/job.slurm | 73 +++++++- benchmarks/multi_node/amd_utils/models.yaml | 1 + .../multi_node/amd_utils/server_sglang.sh | 41 +++- .../multi_node/amd_utils/trace_replay.sh | 93 +++++++++ utils/matrix_logic/generate_sweep_configs.py | 2 + utils/matrix_logic/validation.py | 3 + 11 files changed, 689 insertions(+), 12 deletions(-) create mode 100755 benchmarks/multi_node/agentic/dsr1_fp4_mi355x_sglang-disagg.sh create mode 100755 benchmarks/multi_node/agentic/dsv4_fp4_mi355x_sglang-disagg.sh create mode 100644 benchmarks/multi_node/amd_utils/trace_replay.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 80c14f58b..ec5dc2b70 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2776,3 +2776,122 @@ dsv4-fp4-mi355x-sglang-agentic: # async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, # gpu-mem-util=0.6. 
TP8 sweeps conc 4-64; DEP8 has a single conc=64 # probe to validate the ROCm DP+EP path. + +# target +dsv4-fp4-mi355x-sglang-agentic-hicache: + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + #DPA, conc>=64 + #- { tp: 8, dp-attn: false, offloading: none, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256] } + #- { tp: 8, dp-attn: false, offloading: hicache, conc-list: [4, 8, 16, 32, 48, 64, 128, 196, 256] } + #- { tp: 8, dp-attn: false, offloading: none, conc-list: [64] } + - { tp: 8, dp-attn: false, offloading: hicache, conc-list: [64] } + +dsr1-fp4-mi355x-sglang-disagg-agentic-hicache: + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260609 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - spec-decoding: "none" + conc-list: [ 1 ] + offloading: none + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + - spec-decoding: "none" + conc-list: [ 1 ] + offloading: hicache + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + +dsv4-fp4-mi355x-sglang-disagg-agentic-hicache: + image: lmsysorg/sglang-rocm:v0.5.13-rocm720-mi35x-20260614 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + scenarios: + agentic-coding: + - 
duration: 1800 + search-space: + - spec-decoding: "none" + conc-list: [ 1 ] + offloading: none + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + - spec-decoding: "none" + conc-list: [ 1 ] + offloading: hicache + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index 3533e8175..6de3d92ea 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -416,6 +416,7 @@ jobs: decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} conc: ${{ matrix.config.conc }} duration: ${{ matrix.config.duration }} + offloading: ${{ matrix.config.offloading }} run-eval: false scenario-type: agentic-coding diff --git a/benchmarks/multi_node/agentic/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/agentic/dsr1_fp4_mi355x_sglang-disagg.sh new file mode 100755 index 000000000..c82aea48c --- /dev/null +++ b/benchmarks/multi_node/agentic/dsr1_fp4_mi355x_sglang-disagg.sh @@ -0,0 +1,176 @@ +#!/usr/bin/env bash + +# Agentic trace-replay recipe for a disaggregated SGLang server on MI355X +# (DeepSeek-R1-0528 MXFP4-v2, 1P1D TP8). +# +# CI-style sibling of dsr1_fp8_mi355x_sglang-disagg.sh: driven entirely by +# environment variables and submits a SLURM job via submit.sh. The agentic / +# HiCache-offload configuration is ported from local_test_dsr1_agentic_offload.sh +# and is fully env-overridable so a YAML config can tune it. 
+ +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + DURATION \ + OFFLOADING \ + IS_AGENTIC \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +# Use upstreamed multi_node scripts (no external clone needed) +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="${TIME_LIMIT:-08:00:00}" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +# ── Identity / result naming ── +export MODEL_PREFIX="${MODEL_PREFIX:-dsr1}" +export PRECISION="${PRECISION:-fp4}" +export RESULT_FILENAME="${RESULT_FILENAME:-${RUNNER_NAME:-dsr1-fp4-agentic}}" + +# ── Agentic benchmark params ── +# DURATION threads through submit.sh -> job.slurm -> Docker -> bench.sh. +# CONC_LIST drives the concurrency sweep (submit.sh splits on 'x'). +export DURATION="${DURATION:-1800}" +export MAX_MODEL_LEN="${MAX_MODEL_LEN:-163840}" + +# ── Aiter fault mitigations (ROCm/ROCm#6023) ── +export SGLANG_AITER_MLA_PERSIST="${SGLANG_AITER_MLA_PERSIST:-1}" +# 1 => append --disable-custom-all-reduce to prefill+decode (Aiter fault mitigation). +export DISABLE_CUSTOM_ALL_REDUCE="${DISABLE_CUSTOM_ALL_REDUCE:-1}" + +# # ── Hugging Face cache persistence ── +# # Persist the HF Hub/datasets cache across runs so traces aren't re-downloaded. +# export HF_CACHE_HOST_DIR="${HF_CACHE_HOST_DIR:-$HOME/.cache/huggingface}" +# mkdir -p "${HF_CACHE_HOST_DIR}" +# export EXTRA_DOCKER_MOUNTS="${EXTRA_DOCKER_MOUNTS:-} -v ${HF_CACHE_HOST_DIR}:/root/.cache/huggingface" +# # HF auth token: provide via the environment/CI secrets (do NOT hardcode here). 
+# export HF_TOKEN="${HF_TOKEN:-}" +# if [[ -n "${HF_TOKEN:-}" && -z "${HUGGING_FACE_HUB_TOKEN:-}" ]]; then +# export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN" +# fi + +# ── In-tree sglang patches ── +# mori_conn.py targets hybrid-state bugs (GLM-5, Qwen3.5) not present in +# DSR1-MXFP4-v2 (pure MLA). Skip the auto-apply in job.slurm. +export MORI_CONN_PATCH="${MORI_CONN_PATCH:-skip}" + +# ── KV cache offloading (HiCache) ── +# OFFLOADING=hicache (default for this recipe) | none. HICACHE_TIER: +# L2 -> GPU + CPU-DRAM host pool only. L3 -> + Mooncake distributed KV store. +export OFFLOADING="${OFFLOADING:-hicache}" +export HICACHE_TIER="${HICACHE_TIER:-L3}" +export HICACHE_TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-64}" +export HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}" +export HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-64}" +# Per-rank L2 host pool in GB (100GB/rank x TP8 = ~800GB pinned host DRAM/node). +export HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-100}" + +# ── HiCache layout/backend driven by HICACHE_TIER ── +# Each tier has a canonical (layout, io_backend, write_policy, storage_backend) +# combo (mirrors server_sglang.sh build_hicache_flags). Any var set explicitly +# in the environment wins over the tier default. +# L3 (Mooncake): page_first + direct + write_through + storage=mooncake +# L2 (CPU DRAM): layer_first + direct + write_through_selective + storage=none +if [[ "${HICACHE_TIER^^}" == "L3" ]]; then + export HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first}" + export HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + export HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" + export HICACHE_STORAGE_BACKEND="${HICACHE_STORAGE_BACKEND:-mooncake}" +else + export HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" + export HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + # write_through_selective evicts only under GPU memory pressure, giving mori + # time to complete RDMA KV transfers before pages are freed.
write_through + # evicts immediately and races with mori → GPU memory access faults. + export HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" + export HICACHE_STORAGE_BACKEND="${HICACHE_STORAGE_BACKEND:-}" +fi +export HICACHE_DECODE="${HICACHE_DECODE:-0}" +# Shared nodes: use non-default Mooncake ports to avoid colliding with other users. +export MC_MASTER_PORT="${MC_MASTER_PORT:-58137}" +export MC_METRICS_PORT="${MC_METRICS_PORT:-19003}" +export MC_PATCH_HOSTPOOL="${MC_PATCH_HOSTPOOL:-1}" +export MC_PROTOCOL="${MC_PROTOCOL:-tcp}" +export MC_GLOBAL_SEG="${MC_GLOBAL_SEG:-30gb}" +export MC_DEVICE="${MC_DEVICE:-rdma0}" +export MC_MASTER_ADDR="${MC_MASTER_ADDR:-}" + +# ── MoRIIO RDMA Send Queue tuning (headroom for conc>=8) ── +export MORI_IO_SQ_BACKOFF_TIMEOUT_US="${MORI_IO_SQ_BACKOFF_TIMEOUT_US:-500000}" +export MORI_IO_QP_MAX_SEND_WR="${MORI_IO_QP_MAX_SEND_WR:-32768}" + +# ── SGLang PD router policy + server metrics ── +export PREFILL_ROUTER_POLICY="${PREFILL_ROUTER_POLICY:-random}" +export ENABLE_METRICS="${ENABLE_METRICS:-1}" + +# ── MTP ── +export DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" + +# Derive EP/DP enable flags from the topology inputs (same as the fixed-seq recipe). +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then +export PREFILL_ENABLE_EP=false +else +export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then +export PREFILL_ENABLE_DP=true +else +export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then +export DECODE_ENABLE_EP=false +else +export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then +export DECODE_ENABLE_DP=true +else +export DECODE_ENABLE_DP=false +fi + +# Launch the job. CONC_LIST is space-delimited in YAML; submit.sh wants 'x'. 
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO}) + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/multi_node/agentic/dsv4_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/agentic/dsv4_fp4_mi355x_sglang-disagg.sh new file mode 100755 index 000000000..45c2c96b7 --- /dev/null +++ b/benchmarks/multi_node/agentic/dsv4_fp4_mi355x_sglang-disagg.sh @@ -0,0 +1,160 @@ +#!/usr/bin/env bash + +# Agentic trace-replay recipe for a disaggregated SGLang server on MI355X +# (DeepSeek-V4-Pro FP4, 1P1D TP8). +# +# CI-style sibling of dsr1_fp4_mi355x_sglang-disagg.sh: driven entirely by +# environment variables and submits a SLURM job via submit.sh. The agentic / +# HiCache-offload configuration mirrors the DSR1 recipe but uses DSV4-Pro +# specific flags (dsv4 attention backend, page-size 256, SWA settings). 
+ +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + DURATION \ + OFFLOADING \ + IS_AGENTIC \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +# Use upstreamed multi_node scripts (no external clone needed) +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="${TIME_LIMIT:-08:00:00}" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +# ── Identity / result naming ── +export MODEL_PREFIX="${MODEL_PREFIX:-dsv4}" +export PRECISION="${PRECISION:-fp4}" +export RESULT_FILENAME="${RESULT_FILENAME:-${RUNNER_NAME:-dsv4-fp4-agentic}}" + +# ── Agentic benchmark params ── +export DURATION="${DURATION:-1800}" +# DSV4-Pro max model len for agentic traces (matches single-node recipe). +export MAX_MODEL_LEN="${MAX_MODEL_LEN:-262144}" + +# ── In-tree sglang patches ── +# mori_conn.py targets hybrid-state bugs (GLM-5, Qwen3.5). DSV4-Pro uses a +# pure MoE/DSA architecture without hybrid state; skip to avoid interference. +export MORI_CONN_PATCH="${MORI_CONN_PATCH:-skip}" + +# ── Aiter fault mitigation ── +# Set to 1 to append --disable-custom-all-reduce (avoids a known aiter fault on MI355X); defaults to 0 (off) in this recipe. +export DISABLE_CUSTOM_ALL_REDUCE="${DISABLE_CUSTOM_ALL_REDUCE:-0}" + +# ── KV cache offloading (HiCache) ── +# OFFLOADING=hicache | none (passed from YAML; default none for disagg). +# HICACHE_TIER: L2 -> GPU + CPU-DRAM host pool. L3 -> + Mooncake store.
+export OFFLOADING="${OFFLOADING:-none}" +export HICACHE_TIER="${HICACHE_TIER:-L3}" +export HICACHE_TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-64}" +export HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}" +# DSV4 uses page-size 256 (set in models.yaml); HiCache must match. +export HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-256}" +# Per-rank L2 host pool in GB. +export HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-100}" + +# ── HiCache layout/backend by tier ── +# L3 (Mooncake): page_first + direct + write_through + storage=mooncake +# L2 (CPU DRAM): layer_first + direct + write_through_selective + storage=none +# NOTE: write_through_selective evicts only under GPU memory pressure, avoiding +# the mori RDMA race that causes GPU memory access faults with write_through. +if [[ "${HICACHE_TIER^^}" == "L3" ]]; then + export HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first}" + export HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + export HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" + export HICACHE_STORAGE_BACKEND="${HICACHE_STORAGE_BACKEND:-mooncake}" +else + export HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" + export HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + export HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" + export HICACHE_STORAGE_BACKEND="${HICACHE_STORAGE_BACKEND:-}" +fi +export HICACHE_DECODE="${HICACHE_DECODE:-0}" +# Shared nodes: use non-default Mooncake ports to avoid collisions. 
+export MC_MASTER_PORT="${MC_MASTER_PORT:-58137}" +export MC_METRICS_PORT="${MC_METRICS_PORT:-19003}" +export MC_PATCH_HOSTPOOL="${MC_PATCH_HOSTPOOL:-1}" +export MC_PROTOCOL="${MC_PROTOCOL:-tcp}" +export MC_GLOBAL_SEG="${MC_GLOBAL_SEG:-30gb}" +export MC_DEVICE="${MC_DEVICE:-rdma0}" +export MC_MASTER_ADDR="${MC_MASTER_ADDR:-}" + +# ── MoRIIO RDMA Send Queue tuning ── +export MORI_IO_SQ_BACKOFF_TIMEOUT_US="${MORI_IO_SQ_BACKOFF_TIMEOUT_US:-500000}" +export MORI_IO_QP_MAX_SEND_WR="${MORI_IO_QP_MAX_SEND_WR:-32768}" + +# ── SGLang PD router policy + server metrics ── +export PREFILL_ROUTER_POLICY="${PREFILL_ROUTER_POLICY:-random}" +export ENABLE_METRICS="${ENABLE_METRICS:-1}" + +# ── MTP ── +export DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" + +# Derive EP/DP enable flags from the topology inputs. +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then +export PREFILL_ENABLE_EP=false +else +export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then +export PREFILL_ENABLE_DP=true +else +export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then +export DECODE_ENABLE_EP=false +else +export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then +export DECODE_ENABLE_DP=true +else +export DECODE_ENABLE_DP=false +fi + +# Launch the job. CONC_LIST is space-delimited in YAML; submit.sh wants 'x'. +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO}) + +if [[ $? 
-ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 6b0e4206a..8854949be 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -12,6 +12,17 @@ set -x ENGINE="${ENGINE:-sglang-disagg}" export PYTHONDONTWRITEBYTECODE=1 +# HiCache / Mooncake settings are delivered via a bind-mounted config file (see +# job.slurm) instead of individual docker -e flags. Source it with auto-export so +# the values land in the environment before the "${VAR:-default}" fallbacks below +# apply. Guarded so non-container / single-node runs without the mount still work. +if [[ -f /config/hicache_mc.env ]]; then + set -a + source /config/hicache_mc.env + set +a + echo "[INFO] Loaded HiCache/Mooncake config from /config/hicache_mc.env" +fi + # ============================================================================= # Shared: IBDEVICES detection # ============================================================================= @@ -125,15 +136,22 @@ else export SGLANG_USE_AITER=1 export AITER_LOG_LEVEL=ERROR + # Align with mori-scheduler/scripts/multi_node reference: persist the AITER MLA + # workspace (MLA prefill path) and enable the MXFP4 MoE scale-factor for this + # MXFP4 model. Overridable. + export SGLANG_AITER_MLA_PERSIST="${SGLANG_AITER_MLA_PERSIST:-1}" + export AITER_MXFP4_MOE_SF="${AITER_MXFP4_MOE_SF:-1}" export SGLANG_MORI_DISPATCH_DTYPE=auto export MORI_COMBINE_DTYPE_PREFILL=fp8_direct_cast export MORI_COMBINE_DTYPE_DECODE=fp8 export SGLANG_MORI_QP_PER_TRANSFER=4 export SGLANG_MORI_NUM_WORKERS=4 - export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000 + # Keep these as overridable defaults (not hard assignments), otherwise + # later tuning blocks cannot raise them for high-concurrency runs. 
+ export MORI_IO_SQ_BACKOFF_TIMEOUT_US="${MORI_IO_SQ_BACKOFF_TIMEOUT_US:-500000}" - export MORI_IO_QP_MAX_SEND_WR=16384 + export MORI_IO_QP_MAX_SEND_WR="${MORI_IO_QP_MAX_SEND_WR:-16384}" export MORI_IO_QP_MAX_CQE=32768 export MORI_IO_QP_MAX_SGE=4 @@ -151,6 +169,8 @@ else # Disable allocating memory in one pass export MORI_SHMEM_MODE=ISOLATION + # mori shmem heap size (matches mori-scheduler reference). Overridable. + export MORI_SHMEM_HEAP_SIZE="${MORI_SHMEM_HEAP_SIZE:-1G}" # Enable spec v2 export SGLANG_ENABLE_SPEC_V2=1 @@ -177,6 +197,14 @@ else # 1 mirrors router logs to stdout via tee (useful for live debugging). export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}" + # MoRIIO SQ tuning defaults (can be overridden by caller env). + # Keep explicit exports here so tuned values are guaranteed to reach the + # sglang.launch_server process even if upstream env threading regresses. + export MORI_IO_SQ_BACKOFF_TIMEOUT_US="${MORI_IO_SQ_BACKOFF_TIMEOUT_US:-500000}" + export MORI_IO_QP_MAX_SEND_WR="${MORI_IO_QP_MAX_SEND_WR:-}" + + export MC_TE_METRIC=1 + # QoS/DSCP configuration # Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname if [[ -n "$MORI_RDMA_TC" ]]; then diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 5e8e67606..2dc6227ee 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -201,11 +201,41 @@ else fi } - if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then + # Extract hf_dir from models.yaml (same as vllm-disagg path above) + SGL_DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next} + found && /^[^ ]/{exit} + found && /hf_dir:/{gsub(/[" ]/, "", $2); print $2; exit}' "$MODELS_YAML") + SGL_DISK_DIR_NAME="${SGL_DISK_DIR_NAME:-$MODEL_NAME}" + + # Prefer the caller-supplied MODEL_PATH (recipe scripts set this explicitly); + # fall back to MODEL_DIR/hf_dir then MODEL_DIR/MODEL_NAME. 
+ if [[ -n "${MODEL_PATH:-}" && "$MODEL_PATH" != "$MODEL_DIR" ]]; then + # Caller already resolved the path (e.g. MODEL_PATH=/it-share/hf_cache/models--...) + # Use it directly if it exists on all nodes, otherwise try subdirectory combos. + if check_model_path "$MODEL_PATH" "MODEL_PATH (caller-supplied)"; then + echo "Selected MODEL_PATH: $MODEL_PATH (caller-supplied, available on all nodes)" + elif check_model_path "$MODEL_PATH/$SGL_DISK_DIR_NAME" "$MODEL_PATH/$SGL_DISK_DIR_NAME"; then + MODEL_PATH="$MODEL_PATH/$SGL_DISK_DIR_NAME" + echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" + elif check_model_path "$MODEL_PATH/$MODEL_NAME" "$MODEL_PATH/$MODEL_NAME"; then + MODEL_PATH="$MODEL_PATH/$MODEL_NAME" + echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" + else + echo "FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in:" + echo " - $MODEL_PATH" + echo " - $MODEL_PATH/$SGL_DISK_DIR_NAME" + echo " - $MODEL_PATH/$MODEL_NAME" + exit 1 + fi + elif check_model_path "$MODEL_DIR/$SGL_DISK_DIR_NAME" "$MODEL_DIR/$SGL_DISK_DIR_NAME"; then + MODEL_PATH="$MODEL_DIR/$SGL_DISK_DIR_NAME" + echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" + elif check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then MODEL_PATH="$MODEL_DIR/$MODEL_NAME" echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" else echo "FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in:" + echo " - $MODEL_DIR/$SGL_DISK_DIR_NAME" echo " - $MODEL_DIR/$MODEL_NAME" exit 1 fi @@ -362,7 +392,6 @@ DOCKER_ENV_COMMON=( -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY -e TQDM_MININTERVAL=\$TQDM_MININTERVAL - -e DRY_RUN=\$DRY_RUN -e BENCHMARK_LOGS_DIR=/benchmark_logs -e ENGINE=\$ENGINE -e WS_PATH=${WS_PATH} @@ -378,11 +407,25 @@ DOCKER_ENV_COMMON=( -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP + -e 
PREFILL_CONTEXT_LENGTH=\${PREFILL_CONTEXT_LENGTH:-} + -e PREFILL_CHUNKED_PREFILL_SIZE=\${PREFILL_CHUNKED_PREFILL_SIZE:-} + -e SGLANG_AITER_MLA_PERSIST=\${SGLANG_AITER_MLA_PERSIST:-0} + -e DISABLE_CUSTOM_ALL_REDUCE=\${DISABLE_CUSTOM_ALL_REDUCE:-0} + -e MAX_MODEL_LEN=\${MAX_MODEL_LEN:-} + -e DURATION=\${DURATION:-1800} + -e IS_AGENTIC=\${IS_AGENTIC:-0} + -e OFFLOADING=\${OFFLOADING:-none} + -e ENABLE_METRICS=\${ENABLE_METRICS:-0} + -e PREFILL_ROUTER_POLICY=\${PREFILL_ROUTER_POLICY:-random} + -e DECODE_ROUTER_POLICY=\${DECODE_ROUTER_POLICY:-random} + -e MORI_IO_SQ_BACKOFF_TIMEOUT_US=\${MORI_IO_SQ_BACKOFF_TIMEOUT_US:-} + -e MORI_IO_QP_MAX_SEND_WR=\${MORI_IO_QP_MAX_SEND_WR:-} -e DECODE_TP_SIZE=\$DECODE_TP_SIZE -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE -e IS_MULTINODE=\$IS_MULTINODE + -e DRY_RUN=\${DRY_RUN:-0} ) # Engine-specific env vars @@ -408,6 +451,31 @@ else ) fi +# HiCache / Mooncake settings are delivered via a bind-mounted config file rather +# than a long list of docker -e flags. Write it once to the shared benchmark-logs +# dir (already a host path, visible on every node) and mount it read-only at +# /config/hicache_mc.env, where env.sh sources it before applying its defaults. +# Empty values are preserved so env.sh's "${VAR:-default}" fallbacks still apply. 
+HICACHE_MC_CONFIG="${BENCHMARK_LOGS_DIR}/hicache_mc_${SLURM_JOB_ID}.env" +cat > "$HICACHE_MC_CONFIG" < $HICACHE_MC_CONFIG" + # Engine-specific container filter for pre-clean CONT_FILTER="name=^container_${ENGINE}_" @@ -489,6 +557,7 @@ fi -v /tmp:/run_logs \ -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ + -v ${HICACHE_MC_CONFIG}:/config/hicache_mc.env:ro \ ${EXTRA_DOCKER_MOUNTS:-} \ ${DOCKER_ENV_COMMON[*]} \ ${DOCKER_ENV_ENGINE[*]} \ diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index e68c448ce..60ce93752 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -359,6 +359,7 @@ DeepSeek-R1-0528-MXFP4-v2: # prefill.disable_cuda_graph routes prefill to --disable-cuda-graph; decode keeps # --cuda-graph-bs. See dsv4_mi355x_sglang_disagg_plan.md. DeepSeek-V4-Pro: + hf_dir: "models--deepseek-ai--DeepSeek-V4-Pro" base_flags: "--decode-log-interval 100 --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend dsv4 --page-size 256 --swa-full-tokens-ratio 0.1 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --disaggregation-transfer-backend mori" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index 38fbdfc8e..67b10a2fd 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -419,6 +419,12 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "================================================" + # Install LAUNCH_PREFIX dependencies if needed (e.g. numactl missing in container) + if [[ "${LAUNCH_PREFIX:-}" == *numactl* ]] && ! 
command -v numactl &>/dev/null; then + echo "[server_sglang] Installing numactl (required by LAUNCH_PREFIX)..." + apt-get install -y -q numactl >/dev/null 2>&1 || { echo "[server_sglang] WARNING: numactl install failed"; } + fi + # start the head prefill server PREFILL_MORI_MOE_ENV="" set -x @@ -426,7 +432,7 @@ if [ "$NODE_RANK" -eq 0 ]; then PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" fi set +x - PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL:-${MORI_MAX_DISPATCH_TOKENS_PREFILL}} python3 -m sglang.launch_server \ + PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL:-${MORI_MAX_DISPATCH_TOKENS_PREFILL}} MORI_IO_SQ_BACKOFF_TIMEOUT_US=${MORI_IO_SQ_BACKOFF_TIMEOUT_US} MORI_IO_QP_MAX_SEND_WR=${MORI_IO_QP_MAX_SEND_WR} ${LAUNCH_PREFIX:-} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/$MODEL_NAME \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ @@ -520,11 +526,24 @@ if [ "$NODE_RANK" -eq 0 ]; then export IS_MTP=false fi - # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier - BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ - $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ - ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \ - ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" + # Select the benchmark runner. 
+ # IS_AGENTIC=1/true → agentic trace replay (trace_replay.sh) + # IS_AGENTIC unset/0 → fixed-seq-len throughput benchmark (bench.sh) + if [[ "${IS_AGENTIC:-0}" == "1" || "${IS_AGENTIC:-}" == "true" ]]; then + # trace_replay.sh signature: model_path model_name concurrency_list log_path + BENCH_CMD="bash $SGLANG_WS_PATH/trace_replay.sh \ + $MODEL_DIR $MODEL_NAME $BENCH_MAX_CONCURRENCY /run_logs/slurm_job-${SLURM_JOB_ID}" + echo "Benchmark runner: trace_replay.sh (agentic, OFFLOADING=${OFFLOADING:-none}, CONC=${BENCH_MAX_CONCURRENCY})" + else + # bench.sh signature: + # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path + # isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier + BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ + $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ + ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ + ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" + echo "Benchmark runner: bench.sh (fixed-seq-len)" + fi if [[ "${EVAL_ONLY:-false}" == "true" ]]; then echo "EVAL_ONLY mode: skipping throughput benchmark" @@ -658,7 +677,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" fi set +x - PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL:-${MORI_MAX_DISPATCH_TOKENS_PREFILL}} python3 -m sglang.launch_server \ + PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL:-${MORI_MAX_DISPATCH_TOKENS_PREFILL}} MORI_IO_SQ_BACKOFF_TIMEOUT_US=${MORI_IO_SQ_BACKOFF_TIMEOUT_US} 
MORI_IO_QP_MAX_SEND_WR=${MORI_IO_QP_MAX_SEND_WR} ${LAUNCH_PREFIX:-} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/${MODEL_NAME} \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ @@ -720,13 +739,19 @@ else echo "Decode node rank: $RANK" echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" + # Install LAUNCH_PREFIX dependencies if needed (e.g. numactl missing in container) + if [[ "${LAUNCH_PREFIX:-}" == *numactl* ]] && ! command -v numactl &>/dev/null; then + echo "[server_sglang] Installing numactl (required by LAUNCH_PREFIX)..." + apt-get install -y -q numactl >/dev/null 2>&1 || { echo "[server_sglang] WARNING: numactl install failed"; } + fi + DECODE_MORI_MOE_ENV="" set -x if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_DECODE" ]]; then DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}" fi set +x - DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_DECODE:-${MORI_MAX_DISPATCH_TOKENS_DECODE}} python3 -m sglang.launch_server \ + DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_DECODE:-${MORI_MAX_DISPATCH_TOKENS_DECODE}} MORI_IO_SQ_BACKOFF_TIMEOUT_US=${MORI_IO_SQ_BACKOFF_TIMEOUT_US} MORI_IO_QP_MAX_SEND_WR=${MORI_IO_QP_MAX_SEND_WR} ${LAUNCH_PREFIX:-} python3 -m sglang.launch_server \ --model-path ${MODEL_DIR}/${MODEL_NAME} \ --disaggregation-mode decode \ --disaggregation-ib-device ${IBDEVICES} \ diff --git a/benchmarks/multi_node/amd_utils/trace_replay.sh b/benchmarks/multi_node/amd_utils/trace_replay.sh new file mode 100644 index 000000000..d3e6b2547 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/trace_replay.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# Dual-Engine Disaggregated Benchmark Runner +# 
+# ENGINE=sglang-disagg (default): SGLang benchmark +# ENGINE=vllm-disagg: vLLM benchmark +# +# Produces JSON result files via benchmark_serving.py so that the CI pipeline +# can collect and process results. +# +# Usage: bash trace_replay.sh model_path model_name \ +# [concurrency_list, x-delimited, default 1] \ +# [log_path, default /run_logs] + +ENGINE="${ENGINE:-sglang-disagg}" + +model_path=$1 +model_name=$2 +concurrency_list=${3:-"1"} +MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}" +# vllm-disagg uses --served-model-name MODEL_NAME; sglang defaults to MODEL_PATH +if [[ "$ENGINE" == "vllm-disagg" ]]; then + MODEL="${MODEL_NAME:-${MODEL_PATH}}" +else + MODEL="${MODEL_PATH}" +fi +log_path=${4:-/run_logs} + +# Split BENCH_MAX_CONCURRENCY (x-delimited, e.g. "8x16x32") into an array. +# Falls back to 1 if unset so the loop always runs at least once. +IFS='x' read -r -a chosen_concurrencies <<< "${concurrency_list}" + + +ROUTER_PORT="${ROUTER_PORT:-30000}" + +export TRANSFORMERS_VERBOSITY=error +export TOKENIZERS_PARALLELISM=false + +# echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" + +RESULT_DIR="${RESULT_DIR:-${log_path}/agentic}" +mkdir -p "$RESULT_DIR" + +source "$(dirname "$0")/../../benchmark_lib.sh" + +# REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)" + +PORT="${ROUTER_PORT}" +MODEL="${MODEL:-${BENCH_MODEL}}" +DURATION="${DURATION:-1800}" +export MODEL DURATION MAX_MODEL_LEN +RESULT_DIR="${RESULT_DIR:-${profile_folder}}" +# RESULT_FILENAME_BASE="${RESULT_FILENAME:-agentic_bench}" + +mkdir -p "$RESULT_DIR" + +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k +resolve_trace_source +install_agentic_deps + +ANY_FAILED=0 +for max_concurrency in "${chosen_concurrencies[@]}"; do + + echo "==========================================" + echo "Agentic trace replay: conc=$max_concurrency" + echo "==========================================" + + # Write artifacts into $RESULT_DIR/conc$max_concurrency (one subdirectory per conc, not flat as in the + # single-node agentic recipes and agentic_srt.sh - TODO confirm).
The CI matrix explodes + # agentic runs to one concurrency per job, so the per-conc loop runs once. + CONC_RESULT_DIR="$RESULT_DIR/conc${max_concurrency}" + mkdir -p "$CONC_RESULT_DIR" + + CONC="$max_concurrency" + USERS="$max_concurrency" + export CONC USERS + build_replay_cmd "$CONC_RESULT_DIR" + + # Per-conc result name consumed by write_agentic_result_json/process_agentic_result.py. + # export RESULT_FILENAME="${RESULT_FILENAME_BASE}_conc${max_concurrency}" + if ! run_agentic_replay_and_write_outputs "$CONC_RESULT_DIR"; then + echo "WARNING: agentic trace replay for conc=$max_concurrency failed (replay or validation) after writing available results" >&2 + ANY_FAILED=1 + fi + + echo "-----------------------------------------" + +done + +# export RESULT_FILENAME="$RESULT_FILENAME_BASE" + +if [ "$ANY_FAILED" -ne 0 ]; then + echo "WARNING: at least one conc had a non-zero exit; per-conc result files were still written when possible." >&2 +fi diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 53efcca9f..ff679a828 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -437,6 +437,7 @@ def generate_full_sweep(args, all_config_data, runner_data): Fields.PREFILL.value: prefill, Fields.DECODE.value: decode, Fields.CONC.value: conc, + Fields.OFFLOADING.value: offloading, Fields.DURATION.value: duration, Fields.EXP_NAME.value: ( f"{model_code}_p{prefill[Fields.NUM_WORKER.value]}x{prefill[Fields.TP.value]}" @@ -845,6 +846,7 @@ def generate_test_config_sweep(args, all_config_data, runner_data=None): Fields.PREFILL.value: prefill, Fields.DECODE.value: decode, Fields.CONC.value: conc, + Fields.OFFLOADING.value: offloading, Fields.DURATION.value: duration, Fields.EXP_NAME.value: ( f"{model_code}_p{prefill[Fields.NUM_WORKER.value]}x{prefill[Fields.TP.value]}" diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index 
4e3f0bbd7..0bc8efedc 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -180,6 +180,9 @@ class MultiNodeAgenticMatrixEntry(BaseModel): prefill: WorkerConfig decode: WorkerConfig conc: int + offloading: Literal["none", "cpu", "ssd", "lmcache", "lmcache-mp", "hicache"] = Field( + default="none", alias=Fields.OFFLOADING.value + ) duration: int = Field(default=1800, alias=Fields.DURATION.value) exp_name: str = Field(alias=Fields.EXP_NAME.value) disagg: bool From 3b74ce537b103d422624d9d1a513e830c389692d Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 12 Jun 2026 05:35:20 +0000 Subject: [PATCH 15/21] fix image --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index ec5dc2b70..e25384931 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2141,13 +2141,13 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: # topology families captured from the validated manual recipe (see # dsv4_mi355x_sglang_disagg_plan.md): # - pure-TP 1P1D (TP8, mori KV transfer) -# - DEP 1P1D (TP8/EP8/DP8, mori KV transfer + mori MoE a2a, dp-attention) +# - DEP 2P1D (TP8/EP8/DP8, mori KV transfer + mori MoE a2a, dp-attention) # DSv4-specific serving knobs (attention-backend dsv4, page-size 256, unified_kv_triton, # AITER indexer, deepseekv4 parsers) live in amd_utils/{models.yaml,env.sh}; the bench # client uses --dsv4 framing (amd_utils/bench.sh). STP only for now (reference recipe has # no spec decoding); MTP is a follow-up. 
dsv4-fp4-mi355x-sglang-disagg: - image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610 + image: rocm/sgl-dev:sglang-0.5.12.post1-rocm720-mi35x-mori-0610-dsv4 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x-disagg From 3f6f20f565dba1c162379390d4e6239e84cd9a9f Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 12 Jun 2026 06:08:12 +0000 Subject: [PATCH 16/21] fix the image --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index e25384931..8c4dcf0b8 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2147,7 +2147,7 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: # client uses --dsv4 framing (amd_utils/bench.sh). STP only for now (reference recipe has # no spec decoding); MTP is a follow-up. dsv4-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.12.post1-rocm720-mi35x-mori-0610-dsv4 + image: rocm/sgl-dev:sglang-0.5.12.post1-rocm720-mi35x-mori-0610-dsv4-ep model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x-disagg From 31b392a752889fdee9c0da12b09af373171a4d81 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 12 Jun 2026 06:41:59 +0000 Subject: [PATCH 17/21] fix --- benchmarks/multi_node/amd_utils/env.sh | 4 ++-- benchmarks/multi_node/amd_utils/server_sglang.sh | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 8854949be..3ca0308b1 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -273,8 +273,8 @@ $1 == "DSCP" && $2 == ":" && $NF == p { # Per-role MoRI dispatch sizing (used by the harness chunked/MoE math) export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192 export MORI_MAX_DISPATCH_TOKENS_DECODE=64 - export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=2048 - export MORI_MOE_MAX_INPUT_TOKENS_DECODE=332 + # export 
MORI_MOE_MAX_INPUT_TOKENS_PREFILL=2048 + # export MORI_MOE_MAX_INPUT_TOKENS_DECODE=332 # PER_RANK dispatch tokens are pinned independently of the sizing above # (16384 prefill / 128 decode in the reference recipe). server_sglang.sh diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index 67b10a2fd..46e12a64a 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -196,7 +196,7 @@ if [[ "$PREFILL_ENABLE_DP" == "true" ]] && [[ "$PREFILL_ENABLE_EP" == "true" ]]; prefill_max_running_requests=$BENCH_MAX_CONC_VALUE prefill_dp_ranks=$PREFILL_TP_SIZE # MORI_MAX_DISPATCH_TOKENS_PREFILL stays at 8192 (no change) - MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2)) + # MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2)) echo "[DP+EP override] Prefill: max-running-requests=$prefill_max_running_requests, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" fi @@ -217,7 +217,7 @@ if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; t decode_max_running_requests=$BENCH_MAX_CONC_VALUE decode_dp_ranks=$DECODE_TP_SIZE MORI_MAX_DISPATCH_TOKENS_DECODE=$((BENCH_MAX_CONC_VALUE / decode_dp_ranks)) - MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10)) + # MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10)) # Update derived variable SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD @@ -253,7 +253,7 @@ fi if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) - MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) + # 
MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) fi # ============================================================================= From dbbfc64521847376eb35812718058e061385c2c9 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 12 Jun 2026 07:53:01 +0000 Subject: [PATCH 18/21] fix --- benchmarks/multi_node/amd_utils/env.sh | 4 ++-- benchmarks/multi_node/amd_utils/models.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 3ca0308b1..ed48813e2 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -273,8 +273,8 @@ $1 == "DSCP" && $2 == ":" && $NF == p { # Per-role MoRI dispatch sizing (used by the harness chunked/MoE math) export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192 export MORI_MAX_DISPATCH_TOKENS_DECODE=64 - # export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=2048 - # export MORI_MOE_MAX_INPUT_TOKENS_DECODE=332 + unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL + unset MORI_MOE_MAX_INPUT_TOKENS_DECODE # PER_RANK dispatch tokens are pinned independently of the sizing above # (16384 prefill / 128 decode in the reference recipe). server_sglang.sh diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 60ce93752..28e785222 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -360,7 +360,7 @@ DeepSeek-R1-0528-MXFP4-v2: # --cuda-graph-bs. See dsv4_mi355x_sglang_disagg_plan.md. 
DeepSeek-V4-Pro: hf_dir: "models--deepseek-ai--DeepSeek-V4-Pro" - base_flags: "--decode-log-interval 100 --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend dsv4 --page-size 256 --swa-full-tokens-ratio 0.1 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 100 --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend dsv4 --page-size 256 --swa-full-tokens-ratio 0.1 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --disaggregation-transfer-backend mori" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 From 836f3968cb8cfd53dc1dcf3c445a25edf27a7616 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 15 Jun 2026 01:02:57 +0000 Subject: [PATCH 19/21] bump image --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 8c4dcf0b8..ef6b96150 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2147,7 +2147,7 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: # client uses --dsv4 framing (amd_utils/bench.sh). STP only for now (reference recipe has # no spec decoding); MTP is a follow-up. 
dsv4-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.12.post1-rocm720-mi35x-mori-0610-dsv4-ep + image: lmsysorg/sglang-rocm:v0.5.13-rocm720-mi35x-20260614 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x-disagg From c21ad067aac39c0206e2eccfe7807d448b429d1b Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 16 Jun 2026 06:56:23 +0000 Subject: [PATCH 20/21] remove numactl Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/server_sglang.sh | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index 46e12a64a..9fba74661 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -419,12 +419,6 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "================================================" - # Install LAUNCH_PREFIX dependencies if needed (e.g. numactl missing in container) - if [[ "${LAUNCH_PREFIX:-}" == *numactl* ]] && ! command -v numactl &>/dev/null; then - echo "[server_sglang] Installing numactl (required by LAUNCH_PREFIX)..." - apt-get install -y -q numactl >/dev/null 2>&1 || { echo "[server_sglang] WARNING: numactl install failed"; } - fi - # start the head prefill server PREFILL_MORI_MOE_ENV="" set -x @@ -739,11 +733,6 @@ else echo "Decode node rank: $RANK" echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" - # Install LAUNCH_PREFIX dependencies if needed (e.g. numactl missing in container) - if [[ "${LAUNCH_PREFIX:-}" == *numactl* ]] && ! command -v numactl &>/dev/null; then - echo "[server_sglang] Installing numactl (required by LAUNCH_PREFIX)..." 
- apt-get install -y -q numactl >/dev/null 2>&1 || { echo "[server_sglang] WARNING: numactl install failed"; } - fi DECODE_MORI_MOE_ENV="" set -x From e37fbc253728b8a6fda0bfd2d0da53382227bfee Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 16 Jun 2026 13:55:33 +0000 Subject: [PATCH 21/21] update dsv4 recipe Signed-off-by: Theresa Shan --- .github/configs/amd-master.yaml | 36 +++++----- .../agentic/dsv4_fp4_mi355x_sglang-disagg.sh | 67 ++++++++++--------- benchmarks/multi_node/amd_utils/models.yaml | 7 +- .../multi_node/amd_utils/server_sglang.sh | 11 +++ 4 files changed, 67 insertions(+), 54 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index ef6b96150..7dc64c714 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2877,21 +2877,21 @@ dsv4-fp4-mi355x-sglang-disagg-agentic-hicache: additional-settings: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - - spec-decoding: "none" - conc-list: [ 1 ] - offloading: hicache - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" + # - spec-decoding: "none" + # conc-list: [ 1 ] + # offloading: hicache + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "DECODE_NODES=1" + # - "DECODE_MTP_SIZE=0" diff --git a/benchmarks/multi_node/agentic/dsv4_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/agentic/dsv4_fp4_mi355x_sglang-disagg.sh index 45c2c96b7..beba7dd46 100755 --- a/benchmarks/multi_node/agentic/dsv4_fp4_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/agentic/dsv4_fp4_mi355x_sglang-disagg.sh @@ -71,39 +71,42 @@ export DISABLE_CUSTOM_ALL_REDUCE="${DISABLE_CUSTOM_ALL_REDUCE:-0}" # OFFLOADING=hicache | 
none (passed from YAML; default none for disagg). # HICACHE_TIER: L2 -> GPU + CPU-DRAM host pool. L3 -> + Mooncake store. export OFFLOADING="${OFFLOADING:-none}" -export HICACHE_TIER="${HICACHE_TIER:-L3}" -export HICACHE_TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-64}" -export HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}" -# DSV4 uses page-size 256 (set in models.yaml); HiCache must match. -export HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-256}" -# Per-rank L2 host pool in GB. -export HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-100}" - -# ── HiCache layout/backend by tier ── -# L3 (Mooncake): page_first + direct + write_through + storage=mooncake -# L2 (CPU DRAM): layer_first + direct + write_through_selective + storage=none -# NOTE: write_through_selective evicts only under GPU memory pressure, avoiding -# the mori RDMA race that causes GPU memory access faults with write_through. -if [[ "${HICACHE_TIER^^}" == "L3" ]]; then - export HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first}" - export HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" - export HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" - export HICACHE_STORAGE_BACKEND="${HICACHE_STORAGE_BACKEND:-mooncake}" -else - export HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" - export HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" - export HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" - export HICACHE_STORAGE_BACKEND="${HICACHE_STORAGE_BACKEND:-}" +# HiCache/Mooncake tunables only matter when KV offloading is enabled. +if [[ "$OFFLOADING" == "hicache" ]]; then + export HICACHE_TIER="${HICACHE_TIER:-L2}" + export HICACHE_TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-64}" + export HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}" + # DSV4 uses page-size 256 (set in models.yaml); HiCache must match. + export HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-256}" + # Per-rank L2 host pool in GB. 
+ export HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-100}" + + # ── HiCache layout/backend by tier ── + # L3 (Mooncake): page_first + direct + write_through + storage=mooncake + # L2 (CPU DRAM): layer_first + direct + write_through_selective + storage=none + # NOTE: write_through_selective evicts only under GPU memory pressure, avoiding + # the mori RDMA race that causes GPU memory access faults with write_through. + if [[ "${HICACHE_TIER^^}" == "L3" ]]; then + export HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first}" + export HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + export HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" + export HICACHE_STORAGE_BACKEND="${HICACHE_STORAGE_BACKEND:-mooncake}" + else + export HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" + export HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + export HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" + export HICACHE_STORAGE_BACKEND="${HICACHE_STORAGE_BACKEND:-}" + fi + export HICACHE_DECODE="${HICACHE_DECODE:-0}" + # Shared nodes: use non-default Mooncake ports to avoid collisions. + export MC_MASTER_PORT="${MC_MASTER_PORT:-58137}" + export MC_METRICS_PORT="${MC_METRICS_PORT:-19003}" + export MC_PATCH_HOSTPOOL="${MC_PATCH_HOSTPOOL:-1}" + export MC_PROTOCOL="${MC_PROTOCOL:-tcp}" + export MC_GLOBAL_SEG="${MC_GLOBAL_SEG:-30gb}" + export MC_DEVICE="${MC_DEVICE:-rdma0}" + export MC_MASTER_ADDR="${MC_MASTER_ADDR:-}" fi -export HICACHE_DECODE="${HICACHE_DECODE:-0}" -# Shared nodes: use non-default Mooncake ports to avoid collisions. 
-export MC_MASTER_PORT="${MC_MASTER_PORT:-58137}" -export MC_METRICS_PORT="${MC_METRICS_PORT:-19003}" -export MC_PATCH_HOSTPOOL="${MC_PATCH_HOSTPOOL:-1}" -export MC_PROTOCOL="${MC_PROTOCOL:-tcp}" -export MC_GLOBAL_SEG="${MC_GLOBAL_SEG:-30gb}" -export MC_DEVICE="${MC_DEVICE:-rdma0}" -export MC_MASTER_ADDR="${MC_MASTER_ADDR:-}" # ── MoRIIO RDMA Send Queue tuning ── export MORI_IO_SQ_BACKOFF_TIMEOUT_US="${MORI_IO_SQ_BACKOFF_TIMEOUT_US:-500000}" diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 28e785222..35cba0cd8 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -357,9 +357,8 @@ DeepSeek-R1-0528-MXFP4-v2: # thinking/reasoning-effort, dispatch dtypes, per-role PER_RANK dispatch tokens) is set # in env.sh's DeepSeek-V4-Pro block. The bench client uses --dsv4 framing (bench.sh). # prefill.disable_cuda_graph routes prefill to --disable-cuda-graph; decode keeps -# --cuda-graph-bs. See dsv4_mi355x_sglang_disagg_plan.md. +# --cuda-graph-bs. See dsv4_mi355x_sglang_disagg_plan.md. 
hf_dir: "models--deepseek-ai--DeepSeek-V4-Pro" DeepSeek-V4-Pro: - hf_dir: "models--deepseek-ai--DeepSeek-V4-Pro" base_flags: "--decode-log-interval 100 --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend dsv4 --page-size 256 --swa-full-tokens-ratio 0.1 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --disaggregation-transfer-backend mori" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -369,12 +368,12 @@ DeepSeek-V4-Pro: dp: max_running_requests: 1024 chunked_prefill_size: 131072 - context_length: 9217 + context_length: 1000000 max_total_tokens: 262144 no_dp: max_running_requests: 128 chunked_prefill_size: 131072 - context_length: 9217 + context_length: 1000000 max_total_tokens: 262144 decode: mem_fraction_static: 0.85 diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index 9fba74661..2f6c8145e 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -231,9 +231,14 @@ if [[ "$PREFILL_DISABLE_CUDA_GRAPH" == "True" ]] || [[ "$PREFILL_DISABLE_CUDA_GR else PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " fi + if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" fi +# Agentic runs: keep radix/prefix cache enabled by replacing --disable-radix-cache with empty. 
+if [[ "${IS_AGENTIC:-0}" == "1" || "${IS_AGENTIC:-}" == "true" ]]; then + PREFILL_MODE_FLAGS="${PREFILL_MODE_FLAGS//--disable-radix-cache/}" +fi if [[ -n "$prefill_context_length" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --context-length ${prefill_context_length}" fi @@ -367,6 +372,12 @@ build_server_config() { PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE") DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE") +# Expose Prometheus /metrics on the servers when requested (ENABLE_METRICS=1). +if [[ "${ENABLE_METRICS:-0}" == "1" ]]; then + [[ "$PREFILL_SERVER_CONFIG" != *"--enable-metrics"* ]] && PREFILL_SERVER_CONFIG="$PREFILL_SERVER_CONFIG --enable-metrics" + [[ "$DECODE_SERVER_CONFIG" != *"--enable-metrics"* ]] && DECODE_SERVER_CONFIG="$DECODE_SERVER_CONFIG --enable-metrics" +fi + if [[ -n "$MODEL_NAME" ]]; then echo "Using model-specific configuration for: $MODEL_NAME" fi