[bugfix] restore pr-7029 and fix patch error (#7294)
### What this PR does / why we need it?
This PR restores #7029, which adds W8A8C8 support for dsv3.2/glm5 using
the `lightning_indexer_quant` ops in the pd-mix stage.
The original PR was reverted by #7288 because the patch did not work
with the recompute scheduler.
This PR also fixes the patching issue so that it works correctly with
the recompute scheduler.
### Does this PR introduce _any_ user-facing change?
Yes. To enable LI C8, users need to set the `enable_sparse_c8` option to
`"true"` in `additional_config`.
- vLLM version: v0.17.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: rjg-lyh <1318825571@qq.com>
This commit is contained in:
@@ -266,6 +266,33 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
|
||||
vllm_model.generate_greedy(long_example_prompts, max_tokens)
|
||||
|
||||
|
||||
@patch.dict(os.environ, {"HCCL_OP_EXPANSION_MODE": "AIV"})
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
@patch.dict(os.environ, {"ASCEND_AGGREGATE_ENABLE": "1"})
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
def test_deepseek3_2_w8a8c8_pruning_mtp_tp2_ep():
    """End-to-end check of the DeepSeek-V3.2 W8A8C8 pruning model with MTP,
    TP=2 and expert parallelism.

    Greedy generation is run twice: once with a trivially short prompt and
    once with a prompt padded close to the model's maximum context length,
    so both ends of the sequence-length range are exercised. The
    ``enable_sparse_c8`` flag in ``additional_config`` switches on the
    lightning-indexer C8 quantization path under test.
    """
    short_example_prompts = [
        "Hello ",
    ]
    # "max_position_embeddings": 163840,
    # Pad the long prompt so prompt length + max_tokens stays within the
    # 163840-token context window.
    long_example_prompts = ["Hello " * (163839 - 500) + "Hello"]
    max_tokens = 500
    with VllmRunner(
            "vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
            tensor_parallel_size=2,
            quantization="ascend",
            enable_expert_parallel=True,
            max_model_len=163840,
            compilation_config={
                "cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12],
                "cudagraph_mode": "FULL_DECODE_ONLY",
            },
            speculative_config={
                "num_speculative_tokens": 1,
                "method": "deepseek_mtp",
            },
            additional_config={
                "layer_sharding": ["q_b_proj", "o_proj"],
                "enable_sparse_c8": True,
            },
            reasoning_parser="deepseek_v3",
            tokenizer_mode="deepseek_v32",
    ) as vllm_model:
        vllm_model.generate_greedy(short_example_prompts, max_tokens)
        vllm_model.generate_greedy(long_example_prompts, max_tokens)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", QWEN_W4A4_MODELS)
|
||||
def test_qwen3_w4a4_distributed_tp2(model):
|
||||
example_prompts = [
|
||||
|
||||
Reference in New Issue
Block a user