[test] add w4a8 accuracy case (#5110)
### What this PR does / why we need it?
This PR adds a w4a8 accuracy test case to the e2e tests.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
By running the test
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: cuikai (C) <c00827167@china.huawei.com>
Co-authored-by: cuikai (C) <c00827167@china.huawei.com>
This commit is contained in:
3
.github/workflows/_e2e_test.yaml
vendored
3
.github/workflows/_e2e_test.yaml
vendored
@@ -192,11 +192,11 @@ jobs:
|
|||||||
# To avoid oom, we need to run the test in a single process.
|
# To avoid oom, we need to run the test in a single process.
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
|
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
|
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
|
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe
|
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_fc2_for_qwen3_moe
|
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_fc2_for_qwen3_moe
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_flashcomm_v1
|
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_flashcomm_v1
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight
|
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight
|
||||||
|
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_w4a8_accuracy
|
||||||
|
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/test_prefix_caching.py
|
pytest -sv --durations=0 tests/e2e/multicard/test_prefix_caching.py
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/test_pipeline_parallel.py
|
pytest -sv --durations=0 tests/e2e/multicard/test_pipeline_parallel.py
|
||||||
@@ -265,7 +265,6 @@ jobs:
|
|||||||
VLLM_USE_MODELSCOPE: True
|
VLLM_USE_MODELSCOPE: True
|
||||||
run: |
|
run: |
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
|
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
|
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Kimi_K2_Thinking_W4A16
|
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Kimi_K2_Thinking_W4A16
|
||||||
pytest -sv --durations=0 tests/e2e/multicard/test_data_parallel_tp2.py
|
pytest -sv --durations=0 tests/e2e/multicard/test_data_parallel_tp2.py
|
||||||
|
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ from modelscope import snapshot_download # type: ignore
|
|||||||
from vllm import SamplingParams
|
from vllm import SamplingParams
|
||||||
|
|
||||||
from tests.e2e.conftest import VllmRunner
|
from tests.e2e.conftest import VllmRunner
|
||||||
|
from tests.e2e.model_utils import check_outputs_equal
|
||||||
|
|
||||||
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
||||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||||
@@ -84,22 +85,6 @@ def test_models_distributed_Qwen3_W4A8DYNAMIC(model):
|
|||||||
vllm_model.generate_greedy(prompts, max_tokens)
|
vllm_model.generate_greedy(prompts, max_tokens)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
|
|
||||||
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
|
|
||||||
def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
|
|
||||||
prompts = [
|
|
||||||
"Hello, my name is",
|
|
||||||
]
|
|
||||||
max_tokens = 5
|
|
||||||
with VllmRunner(snapshot_download(model),
|
|
||||||
dtype="auto",
|
|
||||||
tensor_parallel_size=2,
|
|
||||||
quantization="ascend",
|
|
||||||
enforce_eager=True,
|
|
||||||
enable_expert_parallel=True) as vllm_model:
|
|
||||||
vllm_model.generate_greedy(prompts, max_tokens)
|
|
||||||
|
|
||||||
|
|
||||||
def test_sp_for_qwen3_moe() -> None:
|
def test_sp_for_qwen3_moe() -> None:
|
||||||
example_prompts = [
|
example_prompts = [
|
||||||
"Hello, my name is",
|
"Hello, my name is",
|
||||||
@@ -121,6 +106,38 @@ def test_sp_for_qwen3_moe() -> None:
|
|||||||
vllm_model.generate(example_prompts, sampling_params)
|
vllm_model.generate(example_prompts, sampling_params)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
def test_deepseek_w4a8_accuracy(model):
    """Compare W4A8 DeepSeek completions against pinned reference texts.

    Each model in ``DEEPSEEK_W4A8_MODELS`` is run with ``quantization="ascend"``,
    ``tensor_parallel_size=2`` and expert parallelism enabled. Three prompts
    are generated with ``temperature=0.0`` and ``max_tokens=5``, and the
    resulting texts are checked against ``vllm_ds_w4a8_answers`` through
    ``check_outputs_equal``.

    ``HCCL_BUFFSIZE`` is patched to ``"2048"`` for the duration of the test.
    """
    prompts = [
        "Hello, my name is", "The president of the United States is",
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs"
    ]
    # Pinned reference completions, one per prompt and in the same order.
    vllm_ds_w4a8_answers = [
        '逍遙而至地去 accrued', '平行于我udo madreHelen', 'ysteepaolis backwards Kj'
    ]
    sampling_params = SamplingParams(max_tokens=5, temperature=0.0)
    with VllmRunner(snapshot_download(model),
                    dtype="auto",
                    tensor_parallel_size=2,
                    quantization="ascend",
                    enable_expert_parallel=True) as vllm_model:
        vllm_quant_outputs = vllm_model.model.generate(prompts,
                                                       sampling_params)

    # Both sides are shaped as ([completion index], text) pairs. The first
    # slot holds the index of the first completion of each request, which is
    # why the reference side uses [0].
    # NOTE(review): presumably check_outputs_equal compares both tuple
    # elements pairwise -- confirm against tests.e2e.model_utils.
    vllm_quant_outputs_list = [
        ([output.outputs[0].index], output.outputs[0].text)
        for output in vllm_quant_outputs
    ]
    vllm_answer_list = [([0], answer) for answer in vllm_ds_w4a8_answers]

    # Labels must follow the list they describe, otherwise a failure message
    # attributes the reference text to the model and vice versa.
    check_outputs_equal(outputs_0_lst=vllm_answer_list,
                        outputs_1_lst=vllm_quant_outputs_list,
                        name_0="vllm_answer_outputs",
                        name_1="vllm_quant_outputs")
|
||||||
|
|
||||||
|
|
||||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
|
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
|
||||||
@patch.dict(os.environ, {"VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "1"})
|
@patch.dict(os.environ, {"VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "1"})
|
||||||
def test_fc2_for_qwen3_moe() -> None:
|
def test_fc2_for_qwen3_moe() -> None:
|
||||||
|
|||||||
Reference in New Issue
Block a user