diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 245fa85d..17fb04ff 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -215,6 +215,7 @@ jobs:
         if: ${{ inputs.type == 'light' }}
         run: |
           pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_distributed_mp_tp2_ep
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek3_2_w8a8_pruning_mtp_tp2_ep
 
       - name: Run vllm-project/vllm-ascend test (full)
         env:
@@ -237,6 +238,7 @@ jobs:
           pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek_v2_lite_fc1_tp2
           pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_dense_fc1_tp2
           pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_dense_prefetch_mlp_weight_tp2
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek3_2_w8a8_pruning_mtp_tp2_ep
           pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_weight_load.py
           pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_pipeline_parallel.py
diff --git a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
index d06dece3..f5ce1730 100644
--- a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
@@ -235,3 +235,29 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
             quantization="ascend",
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
+@patch.dict(os.environ, {"HCCL_OP_EXPANSION_MODE": "AIV"})
+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "0"})
+@patch.dict(os.environ, {"ASCEND_AGGREGATE_ENABLE": "1"})
+@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
+def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    max_tokens = 5
+    with VllmRunner("vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
+                    tensor_parallel_size=2,
+                    quantization="ascend",
+                    enable_expert_parallel=True,
+                    compilation_config={
+                        "cudagraph_capture_sizes": [3, 6, 9, 12],
+                        "cudagraph_mode": "FULL_DECODE_ONLY"
+                    },
+                    speculative_config={
+                        "num_speculative_tokens": 2,
+                        "method": "deepseek_mtp"
+                    },
+                    reasoning_parser="deepseek_v3",
+                    tokenizer_mode="deepseek_v32") as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index aef66fa4..ced415c1 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -106,6 +106,8 @@ class EagleProposer(VllmEagleProposer):
         self.dcp_size = self.runner.dcp_size
         self.pcp_rank = self.runner.pcp_rank
         self.dcp_rank = self.runner.dcp_rank
+
+        self.use_aclgraph = self.runner._use_aclgraph()
 
         self.full_indices = range(
             self.runner.max_num_tokens * self.pcp_size * self.dcp_size +
diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py
index f2710050..eab2846d 100644
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -242,11 +242,11 @@ class MtpProposer(EagleProposer):
         assert self.runner is not None
 
         # Note(qcs): We may need to refactor these check logics.
-        if self.use_cuda_graph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
+        if self.runner.use_aclgraph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
                 -1]:
             num_input_tokens = self.vllm_config.pad_for_cudagraph(
                 num_scheduled_tokens)
-        elif self.use_cuda_graph and num_tokens <= self.runner.cudagraph_batch_sizes[
+        elif self.use_aclgraph and num_tokens <= self.runner.cudagraph_batch_sizes[
                 -1]:
             # Acl graph mode, add padding to the batch size
             num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)