[main][quantization] Support deepseek w4a8 per-channel quantization (#3011)

### What this PR does / why we need it?
1. Support DeepSeek w4a8 per-channel quantization.
2. Eager mode now supports converting weights to the NZ format.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
#### How to get weights using Modelslim
##### Installation steps
git clone https://gitcode.com/Ascend/msit.git
cd msit/msmodelslim
bash install.sh
##### Generate w4a8 per-channel weights
cd /example/DeepSeek
Command reference: msmodelslim/example/DeepSeek/README.md

- vLLM version: v0.10.2
- vLLM main: f225ea7dd9

---------
Signed-off-by: Wang Kunpeng <1289706727@qq.com>
2025-09-27 21:01:16 +08:00
parent e9359bd8fa
commit 859e861d92
6 changed files with 299 additions and 196 deletions
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -35,6 +35,11 @@ QWEN_DENSE_MODELS = [
    "vllm-ascend/Qwen3-8B-W8A8", "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
 ]

+DEEPSEEK_W4A8_MODELS = [
+    "vllm-ascend/DeepSeek-V3-W4A8-Pruing",
+    "vllm-ascend/DeepSeek-V3.1-W4A8-puring"
+]
+

 def test_models_distributed_QwQ():
    example_prompts = [
@@ -109,14 +114,15 @@ def test_models_distributed_Qwen3_W4A8DYNAMIC():
        vllm_model.generate_greedy(example_prompts, max_tokens)


+@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_MLA_PA": "1"})
-def test_models_distributed_DeepSeek_W4A8DYNAMIC():
+def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
    prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    with VllmRunner(
-            snapshot_download("vllm-ascend/DeepSeek-V3-W4A8-Pruing"),
+            snapshot_download(model),
            dtype="auto",
            tensor_parallel_size=2,
            quantization="ascend",