[main][quantization] Support deepseek w4a8 per-channel quantization (#3011)
### What this PR does / why we need it?
1.Support deepseek w4a8 per-channel quantization
2.The eager mode supports converting weights to the NZ format
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
#### How to get weights using Modelslim
##### Installation steps
git clone https://gitcode.com/Ascend/msit.git
cd msit/msmodelslim
bash install.sh
##### Generate w4a8 per-channel weights
cd /example/DeepSeek
Command reference: msmodelslim/example/DeepSeek/README.md
- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9
---------
Signed-off-by: Wang Kunpeng <1289706727@qq.com>
This commit is contained in:
@@ -35,6 +35,11 @@ QWEN_DENSE_MODELS = [
|
||||
"vllm-ascend/Qwen3-8B-W8A8", "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
|
||||
]
|
||||
|
||||
DEEPSEEK_W4A8_MODELS = [
|
||||
"vllm-ascend/DeepSeek-V3-W4A8-Pruing",
|
||||
"vllm-ascend/DeepSeek-V3.1-W4A8-puring"
|
||||
]
|
||||
|
||||
|
||||
def test_models_distributed_QwQ():
|
||||
example_prompts = [
|
||||
@@ -109,14 +114,15 @@ def test_models_distributed_Qwen3_W4A8DYNAMIC():
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_MLA_PA": "1"})
|
||||
def test_models_distributed_DeepSeek_W4A8DYNAMIC():
|
||||
def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
]
|
||||
max_tokens = 5
|
||||
with VllmRunner(
|
||||
snapshot_download("vllm-ascend/DeepSeek-V3-W4A8-Pruing"),
|
||||
snapshot_download(model),
|
||||
dtype="auto",
|
||||
tensor_parallel_size=2,
|
||||
quantization="ascend",
|
||||
|
||||
Reference in New Issue
Block a user