Revert "[Perf][1/N] w8a8c8 support in dsv3.2/glm5 (#7029)" (#7288)

### What this PR does / why we need it?
This reverts commit 7ed9e9de69, which
introduces an issue where the patch doesn't work with the recompute scheduler
enabled.
- vLLM version: v0.17.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
Mengqing Cao
2026-03-15 20:19:09 +08:00
committed by GitHub
parent 29f195a91c
commit 0c299f79b9
24 changed files with 79 additions and 4281 deletions

View File

@@ -266,33 +266,6 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
vllm_model.generate_greedy(long_example_prompts, max_tokens)
@patch.dict(os.environ, {"HCCL_OP_EXPANSION_MODE": "AIV"})
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
@patch.dict(os.environ, {"ASCEND_AGGREGATE_ENABLE": "1"})
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
def test_deepseek3_2_w8a8c8_pruning_mtp_tp2_ep():
    """Smoke-test DeepSeek-V3.2 W8A8C8 pruning with MTP on TP=2 + expert parallel.

    Runs greedy generation on both a trivially short prompt and a prompt that
    nearly fills the model's maximum context, under full-decode-only graph
    capture with one speculative MTP token.
    """
    gen_tokens = 500
    # A trivially short prompt, plus one sized just under the model's
    # max_position_embeddings (163840) once the generated tokens are accounted for.
    prompts_short = ["Hello "]
    prompts_long = ["Hello " * (163839 - 500) + "Hello"]

    runner = VllmRunner(
        "vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
        tensor_parallel_size=2,
        quantization="ascend",
        enable_expert_parallel=True,
        max_model_len=163840,
        compilation_config={"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12], "cudagraph_mode": "FULL_DECODE_ONLY"},
        speculative_config={"num_speculative_tokens": 1, "method": "deepseek_mtp"},
        additional_config={"layer_sharding": ["q_b_proj", "o_proj"], "enable_sparse_c8": True},
        reasoning_parser="deepseek_v3",
        tokenizer_mode="deepseek_v32",
    )
    with runner as vllm_model:
        # Exercise short-context and near-max-context decoding with the same model.
        for prompts in (prompts_short, prompts_long):
            vllm_model.generate_greedy(prompts, gen_tokens)
@pytest.mark.parametrize("model", QWEN_W4A4_MODELS)
def test_qwen3_w4a4_distributed_tp2(model):
example_prompts = [