[Feature] Add support for the new W4A4_LAOS_DYNAMIC quantization method (#5143)
Introduce W4A4 LAOS Quantization for better model compression and
inference efficiency on Ascend devices.
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
@@ -41,6 +41,10 @@ QWEN_W4A8_MODELS = [
|
||||
"vllm-ascend/Qwen3-1.7B-W4A8-V1",
|
||||
]
|
||||
|
||||
# Model ids used to parametrize the Qwen W4A4 tests.
QWEN_W4A4_MODELS = [
    "Eco-Tech/Qwen3-32B-w4a4-LAOS",
]

# Model ids for the DeepSeek W4A8 tests.
# NOTE(review): "puring" looks like a misspelling of "pruning", but it is part
# of the repository id string, so it must stay verbatim unless the upstream
# repository is renamed.
DEEPSEEK_W4A8_MODELS = [
    "vllm-ascend/DeepSeek-V3.1-W4A8-puring",
]
|
||||
@@ -261,3 +265,18 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
|
||||
reasoning_parser="deepseek_v3",
|
||||
tokenizer_mode="deepseek_v32") as vllm_model:
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", QWEN_W4A4_MODELS)
def test_qwen3_w4a4_distributed_tp2(model):
    """Run a short greedy generation on a W4A4 Qwen3 model with TP=2.

    The model snapshot is fetched with ``snapshot_download`` and loaded
    through ``VllmRunner`` using ``quantization="ascend"`` and a tensor
    parallel size of 2. The test only checks that generation completes;
    the generated text is not inspected.
    """
    prompts = ["Hello, my name is"]
    token_budget = 5

    # Engine options are gathered first so the runner construction stays short.
    runner_kwargs = dict(
        tensor_parallel_size=2,
        cudagraph_capture_sizes=[1, 2, 4, 8],
        quantization="ascend",
    )
    model_path = snapshot_download(model)

    with VllmRunner(model_path, **runner_kwargs) as vllm_model:
        vllm_model.generate_greedy(prompts, token_budget)
|
||||
|
||||
Reference in New Issue
Block a user