diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c8fe1ea97..0a770efa4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -10312,6 +10312,175 @@ dsv4-fp4-gb300-dynamo-sglang-mtp: ep: 8 dp-attn: true +dsv4-fp4-gb300-dynamo-sglang-mtp-1k1k: + image: lmsysorg/sglang:nightly-dev-cu13-20260615-c127ba64 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-nv + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [8192] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep8-conc8192-mtp.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [8192] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep16-conc8192-mtp.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [8192] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep16-conc8192-mtp.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-dep4-tp4-mtp.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-dep4-tp4-mtp.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-dep4-tp4-mtp.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml" + decode: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [2] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [2] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-tp4-tp4-mtp.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [2] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-tp4-tp4-mtp.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [2] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-tp4-tp4-mtp.yaml" + decode: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: false + kimik2.5-int4-h100-vllm: image: vllm/vllm-openai:v0.20.2 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep16-conc8192-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep16-conc8192-mtp.yaml new file mode 100644 index 000000000..fed845263 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep16-conc8192-mtp.yaml @@ -0,0 +1,160 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-1p1d-dep16-conc8192" + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: '144' + mem: '0' + +dynamo: + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +model: + path: dsv4-pro + container: "lmsysorg/sglang:nightly-dev-cu13-20260615-c127ba64" + precision: fp4 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '9216' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + + decode_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '4096' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. + + sglang_config: + prefill: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: prefill + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: decode + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 18432 + cuda-graph-max-bs: 1024 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "8192" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep8-conc8192-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep8-conc8192-mtp.yaml new file mode 100644 index 000000000..65f4f4d0a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep8-conc8192-mtp.yaml @@ -0,0 +1,161 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-1p1d-dep8-conc8192" + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: '144' + mem: '0' + +dynamo: + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +model: + path: dsv4-pro + container: "lmsysorg/sglang:nightly-dev-cu13-20260615-c127ba64" + precision: fp4 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '9216' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + + decode_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '2048' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + + sglang_config: + prefill: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: prefill + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 256 + cuda-graph-max-bs: 256 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: decode + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.94 + max-running-requests: 1536 + cuda-graph-max-bs: 256 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "8192" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep16-conc8192-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep16-conc8192-mtp.yaml new file mode 100644 index 000000000..bcafd24bd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep16-conc8192-mtp.yaml @@ -0,0 +1,160 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-2p1d-dep16-conc8192" + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: '144' + mem: '0' + +dynamo: + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +model: + path: dsv4-pro + container: "lmsysorg/sglang:nightly-dev-cu13-20260615-c127ba64" + precision: fp4 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '9216' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + + decode_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '4096' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. + + sglang_config: + prefill: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: prefill + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: decode + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 18432 + cuda-graph-max-bs: 1024 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "8192" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-dep4-tp4-mtp.yaml new file mode 100644 index 000000000..126207601 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-dep4-tp4-mtp.yaml @@ -0,0 +1,134 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p1d-dep4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260615-c127ba64" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml new file mode 100644 index 000000000..346933557 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml @@ -0,0 +1,120 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p1d-tp4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260615-c127ba64" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 8 + cuda-graph-max-bs: 8 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-dep4-tp4-mtp.yaml new file mode 100644 index 000000000..e7c712e50 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-dep4-tp4-mtp.yaml @@ -0,0 +1,134 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p2d-dep4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260615-c127ba64" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 2 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-tp4-tp4-mtp.yaml new file mode 100644 index 000000000..134e08b34 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-tp4-tp4-mtp.yaml @@ -0,0 +1,120 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p2d-tp4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260615-c127ba64" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 2 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 8 + cuda-graph-max-bs: 8 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-dep4-tp4-mtp.yaml new file mode 100644 index 000000000..97eff2d40 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-dep4-tp4-mtp.yaml @@ -0,0 +1,134 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p4d-dep4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260615-c127ba64" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-tp4-tp4-mtp.yaml new file mode 100644 index 000000000..11e059294 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-tp4-tp4-mtp.yaml @@ -0,0 +1,120 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p4d-tp4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260615-c127ba64" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 8 + cuda-graph-max-bs: 8 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml new file mode 100644 index 000000000..b974668d2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml @@ -0,0 +1,134 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p6d-dep4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260615-c127ba64" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-tp4-tp4-mtp.yaml new file mode 100644 index 000000000..b059593e8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-tp4-tp4-mtp.yaml @@ -0,0 +1,120 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p6d-tp4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260615-c127ba64" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 8 + cuda-graph-max-bs: 8 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 67a91d5d2..744e0f971 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3887,3 +3887,13 @@ - "Align MiniMax-M3 B300 vLLM fixed-sequence serving with MiniMax-M2.5 FP8 settings by setting VLLM_FLOAT32_MATMUL_PRECISION=high and restoring max cudagraph capture size 2048." - "Add TP4+EP4 coverage for MiniMax-M3 B300: DP-attention rows for 1k1k/8k1k and the missing non-DP-attention row for 8k1k." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1781 + +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang-mtp-1k1k + description: + - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated SGLang MTP 1k/1k coverage using lmsysorg/sglang:nightly-dev-cu13-20260615-c127ba64" + - "New top-level config (sibling of dsv4-fp4-gb300-dynamo-sglang-mtp which stays on the older 8k/1k image nightly-dev-20260527-14f81a67); separate entry is required because the launcher builds one squashfs per top-level image and the 1k/1k recipes pin a newer container string" + - "Wires 11 disagg recipes adapted from NVIDIA/srt-slurm PR #177 (recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/mtp/), staged locally under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/: 3 high-throughput conc=8192 recipes (1p1d-dep8, 1p1d-dep16, 2p1d-dep16), 4 flat low-latency dep4 recipes (1p1d/1p2d/1p4d/1p6d-dep4-tp4 at conc=64), and 4 flat low-latency tp4 recipes (1p1d/1p2d/1p4d/1p6d-tp4-tp4 at conc=2)" + - "Flat dep4 and tp4 splits replace the upstream disagg-low-latency-dep4-mtp.yaml and disagg-low-latency-tp4-mtp.yaml templates (each base + zip_override_1k1k_lowlat with decode_nodes [1,2,4,6]). The templated form silently broke under the gb300-cw launcher: srtctl fans the single recipe into 4 Slurm submissions per `srtctl apply`, but launch_gb300-cw.sh:282 captures the job IDs with `grep -oP '✅ Job \\K[0-9]+'` into a scalar JOB_ID, jamming 4 newline-separated IDs into one variable. Downstream uses (LOGS_DIR path interpolation at :293-294, squeue polls at :313-314, error message at :315) all corrupt, producing the observed `ERROR: Job 7079\\n7082 failed before creating log file` and orphaning the real Slurm jobs. Single-config files match the 8k1k convention (one recipe per decode-worker count) and force srtctl to emit exactly 1 job per launcher invocation" + - "Decode side runs EAGLE (num-steps=3, eagle-topk=1, num-draft-tokens=4) on all 11 recipes; all recipes pin lmsysorg/sglang:nightly-dev-cu13-20260615-c127ba64" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1697 diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index d21c91d10..aec68d7ee 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -143,6 +143,12 @@ elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "glm5" ]]; then git checkout sa-submission-q2-2026 mkdir -p recipes/sglang/glm5 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5" recipes/sglang/glm5 +elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" + git checkout main + mkdir -p recipes/sglang/deepseek-v4 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4 elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR"