diff --git a/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-A2.yaml b/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-A2.yaml
index 636f7d7b..262ee15b 100644
--- a/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-A2.yaml
+++ b/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-A2.yaml
@@ -3,11 +3,16 @@ model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
 num_nodes: 2
 npu_per_node: 8
 env_common:
+  VLLM_ASCEND_ENABLE_MLAPO: 1
+  VLLM_ASCEND_BALANCE_SCHEDULING: 1
+  HCCL_INTRA_PCIE_ENABLE: 1
+  HCCL_INTRA_ROCE_ENABLE: 0
+  PYTORCH_NPU_ALLOC_CONF: expandable_segments:True
   VLLM_USE_MODELSCOPE: true
-  HCCL_BUFFSIZE: 1024
+  HCCL_BUFFSIZE: 200
   SERVER_PORT: 8080
   OMP_PROC_BIND: false
-  OMP_NUM_THREADS: 10
+  OMP_NUM_THREADS: 1
 
 
 deployment:
@@ -23,15 +28,15 @@ deployment:
       --no-enable-prefix-caching
       --max-num-seqs 16
       --tensor-parallel-size 4
-      --max-model-len 36864
-      --max-num-batched-tokens 6000
+      --max-model-len 16384
+      --max-num-batched-tokens 4096
       --enable-expert-parallel
+      --async-scheduling
       --trust-remote-code
       --quantization ascend
-      --gpu-memory-utilization 0.9
-      --enforce-eager
-      --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
-      --additional-config '{"enable_weight_nz_layout":true}'
+      --gpu-memory-utilization 0.92
+      --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
+      --compilation-config '{"cudagraph_capture_sizes":[4,16,32,48,64], "cudagraph_mode": "FULL_DECODE_ONLY"}'
 
 
   - server_cmd: >
@@ -45,13 +50,13 @@ deployment:
       --no-enable-prefix-caching
       --max-num-seqs 16
       --tensor-parallel-size 4
-      --max-model-len 36864
-      --max-num-batched-tokens 6000
+      --max-model-len 16384
+      --max-num-batched-tokens 4096
       --enable-expert-parallel
+      --async-scheduling
       --trust-remote-code
       --quantization ascend
-      --gpu-memory-utilization 0.9
-      --enforce-eager
-      --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
-      --additional-config '{"enable_weight_nz_layout":true}'
+      --gpu-memory-utilization 0.92
+      --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
+      --compilation-config '{"cudagraph_capture_sizes":[4,16,32,48,64], "cudagraph_mode": "FULL_DECODE_ONLY"}'
 benchmarks:
diff --git a/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-EPLB.yaml b/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-EPLB.yaml
index 6d6a4a43..37195ad5 100644
--- a/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-EPLB.yaml
+++ b/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-EPLB.yaml
@@ -56,7 +56,7 @@ deployment:
         }
       }'
       --additional-config
-      '{"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+      '{"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
 
 
   - server_cmd: >
@@ -94,7 +94,7 @@ deployment:
         }
       }'
       --additional-config
-      '{"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+      '{"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
 
   - server_cmd: >
       vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
diff --git a/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-longseq.yaml b/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-longseq.yaml
index e6bbd7ae..8b91daa1 100644
--- a/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-longseq.yaml
+++ b/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-longseq.yaml
@@ -3,11 +3,12 @@ model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
 num_nodes: 2
 npu_per_node: 16
 env_common:
+  HCCL_OP_EXPANSION_MODE: AIV
   VLLM_USE_MODELSCOPE: true
-  HCCL_BUFFSIZE: 1024
+  HCCL_BUFFSIZE: 768
   SERVER_PORT: 8080
   OMP_PROC_BIND: false
-  OMP_NUM_THREADS: 10
+  OMP_NUM_THREADS: 1
   PYTORCH_NPU_ALLOC_CONF: expandable_segments:True
   HCCL_DETERMINISTIC: True
   TASK_QUEUE_ENABLE: 1
@@ -34,10 +35,10 @@ deployment:
       --seed 1024
       --quantization ascend
       --max-num-seqs 4
-      --max-model-len 36864
+      --max-model-len 32768
       --max-num-batched-tokens 16384
       --trust-remote-code
-      --gpu-memory-utilization 0.9
+      --gpu-memory-utilization 0.85
       --enable-chunked-prefill
       --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
       --kv-transfer-config
@@ -72,10 +73,10 @@ deployment:
       --seed 1024
       --quantization ascend
       --max-num-seqs 4
-      --max-model-len 36864
+      --max-model-len 32768
       --max-num-batched-tokens 256
       --trust-remote-code
-      --gpu-memory-utilization 0.9
+      --gpu-memory-utilization 0.85
       --compilation_config '{"cudagraph_capture_sizes":[4,8,12,16],"cudagraph_mode": "FULL_DECODE_ONLY"}'
       --enable-chunked-prefill
       --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
@@ -103,7 +104,7 @@ benchmarks:
     dataset_path: vllm-ascend/gsm8k
     request_conf: vllm_api_general_chat
     dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
-    max_out_len: 32768
+    max_out_len: 24576
     batch_size: 512
     baseline: 95
     threshold: 5
diff --git a/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8.yaml b/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8.yaml
index fd38c221..660120f3 100644
--- a/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8.yaml
+++ b/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8.yaml
@@ -3,11 +3,12 @@ model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
 num_nodes: 4
 npu_per_node: 16
 env_common:
+  HCCL_OP_EXPANSION_MODE: AIV
   VLLM_USE_MODELSCOPE: true
   HCCL_BUFFSIZE: 1024
   SERVER_PORT: 8080
   OMP_PROC_BIND: false
-  OMP_NUM_THREADS: 10
+  OMP_NUM_THREADS: 1
   PYTORCH_NPU_ALLOC_CONF: expandable_segments:True
   HCCL_DETERMINISTIC: True
   TASK_QUEUE_ENABLE: 1
@@ -36,6 +37,7 @@ deployment:
       --max-num-batched-tokens 16384
       --trust-remote-code
       --gpu-memory-utilization 0.9
+      --no-enable-prefix-caching
       --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
       --kv-transfer-config
       '{"kv_connector": "MooncakeConnectorV1",
@@ -55,7 +57,7 @@ deployment:
         }
       }'
       --additional-config
-      '{"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
+      '{"recompute_scheduler_enable":true}'
 
 
   - server_cmd: >
@@ -74,6 +76,7 @@ deployment:
       --max-num-batched-tokens 16384
       --trust-remote-code
       --gpu-memory-utilization 0.9
+      --no-enable-prefix-caching
       --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
       --kv-transfer-config
       '{"kv_connector": "MooncakeConnectorV1",
@@ -93,7 +96,7 @@ deployment:
         }
       }'
       --additional-config
-      '{"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
+      '{"recompute_scheduler_enable":true}'
 
   - server_cmd: >
       vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -113,7 +116,9 @@ deployment:
      --max-num-batched-tokens 256
       --trust-remote-code
       --gpu-memory-utilization 0.9
+      --no-enable-prefix-caching
       --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
+      --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
       --kv-transfer-config
       '{"kv_connector": "MooncakeConnectorV1",
       "kv_role": "kv_consumer",
@@ -132,7 +137,11 @@ deployment:
         }
       }'
       --additional-config
-      '{"multistream_overlap_shared_expert":true}'
+      '{"recompute_scheduler_enable":true,
+      "enable_shared_expert_dp":true,
+      "multistream_overlap_shared_expert":true,
+      "finegrained_tp_config": {"lmhead_tensor_parallel_size":8}
+      }'
 
   - server_cmd: >
       vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -151,7 +160,9 @@ deployment:
       --max-num-batched-tokens 256
       --trust-remote-code
       --gpu-memory-utilization 0.9
+      --no-enable-prefix-caching
       --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
+      --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
       --kv-transfer-config
       '{"kv_connector": "MooncakeConnectorV1",
       "kv_role": "kv_consumer",
@@ -170,7 +181,11 @@ deployment:
         }
       }'
       --additional-config
-      '{"multistream_overlap_shared_expert":true}'
+      '{"recompute_scheduler_enable":true,
+      "enable_shared_expert_dp":true,
+      "multistream_overlap_shared_expert":true,
+      "finegrained_tp_config": {"lmhead_tensor_parallel_size":8}
+      }'
 benchmarks:
   perf:
     case_type: performance
diff --git a/tests/e2e/nightly/multi_node/config/DeepSeek-V3.yaml b/tests/e2e/nightly/multi_node/config/DeepSeek-V3.yaml
index bdfc4103..f3fb8a24 100644
--- a/tests/e2e/nightly/multi_node/config/DeepSeek-V3.yaml
+++ b/tests/e2e/nightly/multi_node/config/DeepSeek-V3.yaml
@@ -16,7 +16,8 @@ npu_per_node: 16
 env_common:
   VLLM_USE_MODELSCOPE: true
   OMP_PROC_BIND: false
-  OMP_NUM_THREADS: 100
+  OMP_NUM_THREADS: 1
+  TASK_QUEUE_ENABLE: 1
   HCCL_BUFFSIZE: 1024
   SERVER_PORT: 8080
   NUMEXPR_MAX_THREADS: 128
diff --git a/tests/e2e/nightly/multi_node/config/Qwen3-235B-A22B-A2.yaml b/tests/e2e/nightly/multi_node/config/Qwen3-235B-A22B-A2.yaml
index d56f00fb..941cd1dc 100644
--- a/tests/e2e/nightly/multi_node/config/Qwen3-235B-A22B-A2.yaml
+++ b/tests/e2e/nightly/multi_node/config/Qwen3-235B-A22B-A2.yaml
@@ -27,7 +27,7 @@ deployment:
       --enable-expert-parallel
       --max-num-seqs 128
       --max-model-len 40960
-      --max-num-batched-tokens 256
+      --max-num-batched-tokens 2048
       --trust-remote-code
       --gpu-memory-utilization 0.9
       --async-scheduling
@@ -44,7 +44,7 @@ deployment:
       --seed 1024
       --max-num-seqs 128
       --max-model-len 40960
-      --max-num-batched-tokens 256
+      --max-num-batched-tokens 2048
       --enable-expert-parallel
       --trust-remote-code
       --gpu-memory-utilization 0.9
diff --git a/tests/e2e/nightly/multi_node/config/Qwen3-235B-A22B.yaml b/tests/e2e/nightly/multi_node/config/Qwen3-235B-A22B.yaml
index 40ae591e..5c5f7508 100644
--- a/tests/e2e/nightly/multi_node/config/Qwen3-235B-A22B.yaml
+++ b/tests/e2e/nightly/multi_node/config/Qwen3-235B-A22B.yaml
@@ -3,9 +3,12 @@ model: "Qwen/Qwen3-235B-A22B"
 num_nodes: 2
 npu_per_node: 16
 env_common:
+  HCCL_OP_EXPANSION_MODE: AIV
+  TASK_QUEUE_ENABLE: 1
   VLLM_USE_MODELSCOPE: true
   OMP_PROC_BIND: false
-  OMP_NUM_THREADS: 100
+  OMP_NUM_THREADS: 1
+  PYTORCH_NPU_ALLOC_CONF: expandable_segments:True
   HCCL_BUFFSIZE: 1024
   SERVER_PORT: 8080
   NUMEXPR_MAX_THREADS: 128
diff --git a/tests/e2e/nightly/multi_node/config/Qwen3-235B-W8A8-EPLB.yaml b/tests/e2e/nightly/multi_node/config/Qwen3-235B-W8A8-EPLB.yaml
index 339891d4..6abdd0b5 100644
--- a/tests/e2e/nightly/multi_node/config/Qwen3-235B-W8A8-EPLB.yaml
+++ b/tests/e2e/nightly/multi_node/config/Qwen3-235B-W8A8-EPLB.yaml
@@ -3,9 +3,11 @@ model: "vllm-ascend/Qwen3-235B-A22B-W8A8"
 num_nodes: 2
 npu_per_node: 16
 env_common:
+  HCCL_OP_EXPANSION_MODE: AIV
   VLLM_USE_MODELSCOPE: true
+  TASK_QUEUE_ENABLE: 1
   OMP_PROC_BIND: false
-  OMP_NUM_THREADS: 100
+  OMP_NUM_THREADS: 1
   HCCL_BUFFSIZE: 1024
   SERVER_PORT: 8080
   DYNAMIC_EPLB: true
diff --git a/tests/e2e/nightly/multi_node/config/Qwen3-235B-W8A8-longseq.yaml b/tests/e2e/nightly/multi_node/config/Qwen3-235B-W8A8-longseq.yaml
index f4397e47..f4cacda4 100644
--- a/tests/e2e/nightly/multi_node/config/Qwen3-235B-W8A8-longseq.yaml
+++ b/tests/e2e/nightly/multi_node/config/Qwen3-235B-W8A8-longseq.yaml
@@ -3,9 +3,11 @@ model: "vllm-ascend/Qwen3-235B-A22B-W8A8"
 num_nodes: 2
 npu_per_node: 16
 env_common:
+  HCCL_OP_EXPANSION_MODE: AIV
   VLLM_USE_MODELSCOPE: true
+  TASK_QUEUE_ENABLE: 1
   OMP_PROC_BIND: false
-  OMP_NUM_THREADS: 100
+  OMP_NUM_THREADS: 1
   HCCL_BUFFSIZE: 1024
   SERVER_PORT: 8080
   NUMEXPR_MAX_THREADS: 128
diff --git a/tests/e2e/nightly/multi_node/config/Qwen3-235B-W8A8.yaml b/tests/e2e/nightly/multi_node/config/Qwen3-235B-W8A8.yaml
index 9a1056b3..3572dbbc 100644
--- a/tests/e2e/nightly/multi_node/config/Qwen3-235B-W8A8.yaml
+++ b/tests/e2e/nightly/multi_node/config/Qwen3-235B-W8A8.yaml
@@ -3,9 +3,11 @@ model: "vllm-ascend/Qwen3-235B-A22B-W8A8"
 num_nodes: 2
 npu_per_node: 16
 env_common:
+  HCCL_OP_EXPANSION_MODE: AIV
   VLLM_USE_MODELSCOPE: true
+  TASK_QUEUE_ENABLE: 1
   OMP_PROC_BIND: false
-  OMP_NUM_THREADS: 100
+  OMP_NUM_THREADS: 1
   HCCL_BUFFSIZE: 1024
   SERVER_PORT: 8080
   NUMEXPR_MAX_THREADS: 128