diff --git a/.github/workflows/_e2e_nightly_single_node.yaml b/.github/workflows/_e2e_nightly_single_node.yaml index 4d07e031..66ed7ba4 100644 --- a/.github/workflows/_e2e_nightly_single_node.yaml +++ b/.github/workflows/_e2e_nightly_single_node.yaml @@ -110,19 +110,6 @@ jobs: fi cd .. - - name: Install custom-ops (for DeepSeek-V3.2-Exp) - if: ${{ inputs.name == 'deepseek3_2-exp-w8a8' }} - shell: bash -l {0} - run: | - wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/CANN-custom_ops-sfa-linux.aarch64.run - chmod +x ./CANN-custom_ops-sfa-linux.aarch64.run - ./CANN-custom_ops-sfa-linux.aarch64.run --quiet - export ASCEND_CUSTOM_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize:${ASCEND_CUSTOM_OPP_PATH} - export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH} - wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/custom_ops-1.0-cp311-cp311-linux_aarch64.whl - pip install custom_ops-1.0-cp311-cp311-linux_aarch64.whl - . /usr/local/Ascend/ascend-toolkit/set_env.sh - - name: Install Ascend toolkit & triton_ascend shell: bash -l {0} run: | diff --git a/.github/workflows/nightly_test_a3.yaml b/.github/workflows/nightly_test_a3.yaml index 6c861358..b46b45bc 100644 --- a/.github/workflows/nightly_test_a3.yaml +++ b/.github/workflows/nightly_test_a3.yaml @@ -150,13 +150,12 @@ jobs: - name: kimi-k2-thinking os: linux-aarch64-a3-16 tests: tests/e2e/nightly/single_node/models/test_kimi_k2_thinking.py - # TODO: Replace deepseek3.2-exp with deepseek3.2 after nightly tests pass - # - name: deepseek3_2-exp-w8a8 - # os: linux-aarch64-a3-16 - # tests: tests/e2e/nightly/single_node/models/test_deepseek_v3_2_exp_w8a8.py - - name: deepseek-r1-w8a8-hmb + - name: deepseek-r1-w8a8-hbm os: linux-aarch64-a3-16 - tests: tests/e2e/nightly/single_node/models/test_deepseek_r1_w8a8_hmb.py + tests: tests/e2e/nightly/single_node/models/test_deepseek_r1_w8a8_hbm.py + - name: deepseek3_2-w8a8 + os: linux-aarch64-a3-16 + tests: tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py uses: ./.github/workflows/_e2e_nightly_single_node.yaml with: vllm: v0.13.0 diff --git a/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-Exp-bf16.yaml b/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-Exp-bf16.yaml deleted file mode 100644 index 77577f03..00000000 --- a/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-Exp-bf16.yaml +++ /dev/null @@ -1,51 +0,0 @@ -test_name: "test DeepSeek-V3.2-Exp-bf16 multi-dp" -model: "Yanguan/DeepSeek-V3.2-Exp-bf16" -num_nodes: 2 -npu_per_node: 16 -env_common: - VLLM_USE_MODELSCOPE: true - OMP_PROC_BIND: false - OMP_NUM_THREADS: 100 - HCCL_BUFFSIZE: 1024 - SERVER_PORT: 8080 - VLLM_ASCEND_ENABLE_MLAPO: 0 - -deployment: - - - server_cmd: > - vllm serve "Yanguan/DeepSeek-V3.2-Exp-bf16" - --host 0.0.0.0 - --port $SERVER_PORT - --data-parallel-address $LOCAL_IP - --data-parallel-size 2 - --data-parallel-size-local 1 - --data-parallel-rpc-port 13389 - --tensor-parallel-size 16 - --seed 1024 - --enable-expert-parallel - --max-num-seqs 16 - --max-model-len 17450 - --max-num-batched-tokens 17450 - --trust-remote-code - --no-enable-prefix-caching - --gpu-memory-utilization 0.9 - - - - server_cmd: > - vllm serve "Yanguan/DeepSeek-V3.2-Exp-bf16" - --headless - --data-parallel-size 2 - --data-parallel-size-local 1 - --data-parallel-start-rank 1 - --data-parallel-address $MASTER_IP - --data-parallel-rpc-port 13389 - --tensor-parallel-size 16 - --seed 1024 - --max-num-seqs 16 - --max-model-len 17450 - --max-num-batched-tokens 17450 - --enable-expert-parallel - --trust-remote-code - --no-enable-prefix-caching - --gpu-memory-utilization 0.92 -benchmarks: diff --git a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 index 10be7ad7..dffa0ea2 100644 --- a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 +++ b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 @@ -123,4 +123,4 @@ spec: selector: leaderworkerset.sigs.k8s.io/name: vllm role: leader - type: ClusterIP + type: ClusterIP \ No newline at end of file diff --git a/tests/e2e/nightly/single_node/models/test_deepseek_r1_w8a8_hmb.py b/tests/e2e/nightly/single_node/models/test_deepseek_r1_w8a8_hbm.py similarity index 100% rename from tests/e2e/nightly/single_node/models/test_deepseek_r1_w8a8_hmb.py rename to tests/e2e/nightly/single_node/models/test_deepseek_r1_w8a8_hbm.py diff --git a/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_exp_w8a8.py b/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py similarity index 70% rename from tests/e2e/nightly/single_node/models/test_deepseek_v3_2_exp_w8a8.py rename to tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py index 019bf1d5..8f0b2f64 100644 --- a/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_exp_w8a8.py +++ b/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py @@ -23,13 +23,10 @@ from vllm.utils.network_utils import get_open_port from tests.e2e.conftest import RemoteOpenAIServer from tools.aisbench import run_aisbench_cases -MODELS = [ - "vllm-ascend/DeepSeek-V3.2-Exp-W8A8", -] +MODELS = ["vllm-ascend/DeepSeek-V3.2-W8A8"] TENSOR_PARALLELS = [8] DATA_PARALLELS = [2] -FULL_GRAPH = [True, False] prompts = [ "San Francisco is a", @@ -53,11 +50,11 @@ aisbench_cases = [{ "dataset_path": "vllm-ascend/GSM8K-in3500-bs400", "request_conf": "vllm_api_stream_chat", "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 16, + "num_prompts": 100, "max_out_len": 1500, - "batch_size": 8, - "request_rate": 0, - "baseline": 1, + "batch_size": 4, + "request_rate": 11.2, + "baseline": 120, "threshold": 0.97 }] @@ -66,25 +63,30 @@ aisbench_cases = [{ @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tp_size", TENSOR_PARALLELS) @pytest.mark.parametrize("dp_size", DATA_PARALLELS) -@pytest.mark.parametrize("full_graph", FULL_GRAPH) -async def test_models(model: str, tp_size: int, dp_size: int, - full_graph: bool) -> None: +async def test_models(model: str, tp_size: int, dp_size: int) -> None: port = get_open_port() - env_dict = {"HCCL_BUFFSIZE": "1024", "VLLM_ASCEND_ENABLE_MLAPO": "0"} + env_dict = { + "HCCL_OP_EXPANSION_MODE": "AIV", + "OMP_PROC_BIND": "false", + "OMP_NUM_THREADS": "1", + "HCCL_BUFFSIZE": "1024", + "VLLM_ASCEND_ENABLE_MLAPO": "1", + "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True", + "VLLM_ASCEND_ENABLE_FLASHCOMM1": "0", + } + server_args = [ - "--no-enable-prefix-caching", "--enable-expert-parallel", - "--tensor-parallel-size", + "--enable-expert-parallel", "--tensor-parallel-size", str(tp_size), "--data-parallel-size", str(dp_size), "--port", - str(port), "--max-model-len", "16384", "--max-num-batched-tokens", - "16384", "--block-size", "16", "--trust-remote-code", "--quantization", - "ascend", "--gpu-memory-utilization", "0.9" + str(port), "--max-model-len", "8192", "--max-num-batched-tokens", + "8192", "--max-num-seqs", "4", "--trust-remote-code", "--quantization", + "ascend", "--gpu-memory-utilization", "0.92", "--compilation-config", + '{"cudagraph_capture_sizes":[3, 6, 9, 12], "cudagraph_mode":"FULL_DECODE_ONLY"}', + "--speculative-config", + '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}', + "--reasoning-parser", "deepseek_v3", "--tokenizer_mode", "deepseek_v32" ] - if full_graph: - server_args += [ - "--compilation-config", - '{"cudagraph_capture": [16], "cudagraph_model":"FULL_DECODE_ONLY"}' - ] request_keyword_args: dict[str, Any] = { **api_keyword_args, }