Revert "[Feature][Quant] Auto-detect quantization format from model f… (#6873)

This reverts commit 3953dcf784 to keep the basic functions available.

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
Author: Li Wang
Date: 2026-03-10 11:27:32 +08:00
Committed by: GitHub
Parent: 40f7d93f1a
Commit: 33234aa0c5

7 changed files with 12 additions and 584 deletions

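For context, the test removed in the diff below exercises the reverted feature: a helper, maybe_auto_detect_quantization(), that inspects the checkpoint directory for a ModelSlim quant_model_description.json and enables quantization even when --quantization is omitted. A minimal sketch of that idea follows; the signature, return value, and descriptor layout are assumptions for illustration, not the actual reverted implementation.

import json
import os
from typing import Optional


def maybe_auto_detect_quantization(model_path: str) -> Optional[str]:
    """Return a quantization method name if the checkpoint looks quantized.

    Sketch only: assumes the ModelSlim descriptor maps weight names to
    quantization types such as "W8A8" or "FLOAT".
    """
    desc_path = os.path.join(model_path, "quant_model_description.json")
    if not os.path.isfile(desc_path):
        # No ModelSlim descriptor: leave quantization unset.
        return None
    with open(desc_path) as f:
        description = json.load(f)
    # Any non-float entry suggests a quantized checkpoint (assumption).
    if any(v != "FLOAT" for v in description.values()):
        return "ascend"  # vllm-ascend's quantization method name (assumed)
    return None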

@@ -49,43 +49,6 @@ def test_qwen3_w8a8_quant():
name_1="vllm_quant_w8a8_outputs",
)
# fmt: off
def test_qwen3_w8a8_quant_auto_detect():
"""Test that ModelSlim quantization is auto-detected without --quantization.
Uses the same W8A8 model as test_qwen3_w8a8_quant but omits the
quantization parameter, verifying that the auto-detection in
maybe_auto_detect_quantization() picks up quant_model_description.json
and produces identical results.
"""
max_tokens = 5
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
]
vllm_target_outputs = [([
85, 4086, 44, 374, 264, 1550, 42747, 628, 323, 4938, 72816, 44378, 323,
13480, 4712, 369, 444, 10994, 82, 13, 1084, 374, 6188, 311, 387
], 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be'
)]
# fmt: on
with VllmRunner(
"vllm-ascend/Qwen3-0.6B-W8A8",
max_model_len=8192,
gpu_memory_utilization=0.7,
cudagraph_capture_sizes=[1, 2, 4, 8],
) as vllm_model:
vllm_quant_auto_detect_outputs = vllm_model.generate_greedy(
example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=vllm_target_outputs,
outputs_1_lst=vllm_quant_auto_detect_outputs,
name_0="vllm_target_outputs",
name_1="vllm_quant_auto_detect_outputs",
)
# fmt: off
def test_qwen3_dense_w8a16():
max_tokens = 5