[CI] Add DeepSeek-V3.2-W8A8-Pruning e2e test (#5922)
### What this PR does / why we need it?
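1. Fix MTP speculative decoding for DeepSeek-V3.2-W8A8-Pruning: `MtpProposer` now checks the ACL graph flag (`use_aclgraph`) instead of `use_cuda_graph` when deciding whether to pad the number of input tokens for graph capture.
2. Add an e2e test for DeepSeek-V3.2-W8A8-Pruning (`test_deepseek3_2_w8a8_pruning_mtp_tp2_ep`) and run it in the 2-card e2e workflow.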
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main: 11b6af5280
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
2
.github/workflows/_e2e_test.yaml
vendored
2
.github/workflows/_e2e_test.yaml
vendored
@@ -215,6 +215,7 @@ jobs:
|
|||||||
if: ${{ inputs.type == 'light' }}
|
if: ${{ inputs.type == 'light' }}
|
||||||
run: |
|
run: |
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_distributed_mp_tp2_ep
|
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_distributed_mp_tp2_ep
|
||||||
|
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek3_2_w8a8_pruning_mtp_tp2_ep
|
||||||
|
|
||||||
- name: Run vllm-project/vllm-ascend test (full)
|
- name: Run vllm-project/vllm-ascend test (full)
|
||||||
env:
|
env:
|
||||||
@@ -237,6 +238,7 @@ jobs:
|
|||||||
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek_v2_lite_fc1_tp2
|
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek_v2_lite_fc1_tp2
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_dense_fc1_tp2
|
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_dense_fc1_tp2
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_dense_prefetch_mlp_weight_tp2
|
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_dense_prefetch_mlp_weight_tp2
|
||||||
|
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek3_2_w8a8_pruning_mtp_tp2_ep
|
||||||
|
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_weight_load.py
|
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_weight_load.py
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_pipeline_parallel.py
|
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_pipeline_parallel.py
|
||||||
|
|||||||
@@ -235,3 +235,29 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
|
|||||||
quantization="ascend",
|
quantization="ascend",
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
|
|
||||||
|
|
||||||
|
@patch.dict(os.environ, {"HCCL_OP_EXPANSION_MODE": "AIV"})
|
||||||
|
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "0"})
|
||||||
|
@patch.dict(os.environ, {"ASCEND_AGGREGATE_ENABLE": "1"})
|
||||||
|
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
|
||||||
|
def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
|
||||||
|
example_prompts = [
|
||||||
|
"Hello, my name is",
|
||||||
|
]
|
||||||
|
max_tokens = 5
|
||||||
|
with VllmRunner("vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
|
||||||
|
tensor_parallel_size=2,
|
||||||
|
quantization="ascend",
|
||||||
|
enable_expert_parallel=True,
|
||||||
|
compilation_config={
|
||||||
|
"cudagraph_capture_sizes": [3, 6, 9, 12],
|
||||||
|
"cudagraph_mode": "FULL_DECODE_ONLY"
|
||||||
|
},
|
||||||
|
speculative_config={
|
||||||
|
"num_speculative_tokens": 2,
|
||||||
|
"method": "deepseek_mtp"
|
||||||
|
},
|
||||||
|
reasoning_parser="deepseek_v3",
|
||||||
|
tokenizer_mode="deepseek_v32") as vllm_model:
|
||||||
|
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
|
|||||||
@@ -106,6 +106,8 @@ class EagleProposer(VllmEagleProposer):
|
|||||||
self.dcp_size = self.runner.dcp_size
|
self.dcp_size = self.runner.dcp_size
|
||||||
self.pcp_rank = self.runner.pcp_rank
|
self.pcp_rank = self.runner.pcp_rank
|
||||||
self.dcp_rank = self.runner.dcp_rank
|
self.dcp_rank = self.runner.dcp_rank
|
||||||
|
|
||||||
|
self.use_aclgraph = self.runner._use_aclgraph()
|
||||||
|
|
||||||
self.full_indices = range(
|
self.full_indices = range(
|
||||||
self.runner.max_num_tokens * self.pcp_size * self.dcp_size +
|
self.runner.max_num_tokens * self.pcp_size * self.dcp_size +
|
||||||
|
|||||||
@@ -242,11 +242,11 @@ class MtpProposer(EagleProposer):
|
|||||||
assert self.runner is not None
|
assert self.runner is not None
|
||||||
|
|
||||||
# Note(qcs): We may need to refactor these check logics.
|
# Note(qcs): We may need to refactor these check logics.
|
||||||
if self.use_cuda_graph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
|
if self.runner.use_aclgraph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
|
||||||
-1]:
|
-1]:
|
||||||
num_input_tokens = self.vllm_config.pad_for_cudagraph(
|
num_input_tokens = self.vllm_config.pad_for_cudagraph(
|
||||||
num_scheduled_tokens)
|
num_scheduled_tokens)
|
||||||
elif self.use_cuda_graph and num_tokens <= self.runner.cudagraph_batch_sizes[
|
elif self.use_aclgraph and num_tokens <= self.runner.cudagraph_batch_sizes[
|
||||||
-1]:
|
-1]:
|
||||||
# Acl graph mode, add padding to the batch size
|
# Acl graph mode, add padding to the batch size
|
||||||
num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
|
num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
|
||||||
|
|||||||
Reference in New Issue
Block a user