diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index f4c6a65e..c4442eeb 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -192,11 +192,11 @@ jobs:
           # To avoid oom, we need to run the test in a single process.
           pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
           pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
           pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe
           pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_fc2_for_qwen3_moe
           pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_flashcomm_v1
           pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight
+          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_w4a8_accuracy
           pytest -sv --durations=0 tests/e2e/multicard/test_prefix_caching.py
           pytest -sv --durations=0 tests/e2e/multicard/test_pipeline_parallel.py
 
@@ -265,7 +265,6 @@ jobs:
           VLLM_USE_MODELSCOPE: True
         run: |
           pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
           pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Kimi_K2_Thinking_W4A16
           pytest -sv --durations=0 tests/e2e/multicard/test_data_parallel_tp2.py
 
diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py
index 5cb90bc9..a1e24ecf 100644
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -28,6 +28,7 @@ from modelscope import snapshot_download  # type: ignore
 from vllm import SamplingParams
 
 from tests.e2e.conftest import VllmRunner
+from tests.e2e.model_utils import check_outputs_equal
 
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@@ -84,22 +85,6 @@ def test_models_distributed_Qwen3_W4A8DYNAMIC(model):
         vllm_model.generate_greedy(prompts, max_tokens)
 
 
-@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
-@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
-def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
-    prompts = [
-        "Hello, my name is",
-    ]
-    max_tokens = 5
-    with VllmRunner(snapshot_download(model),
-                    dtype="auto",
-                    tensor_parallel_size=2,
-                    quantization="ascend",
-                    enforce_eager=True,
-                    enable_expert_parallel=True) as vllm_model:
-        vllm_model.generate_greedy(prompts, max_tokens)
-
-
 def test_sp_for_qwen3_moe() -> None:
     example_prompts = [
         "Hello, my name is",
@@ -121,6 +106,38 @@ def test_sp_for_qwen3_moe() -> None:
         vllm_model.generate(example_prompts, sampling_params)
 
 
+@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
+@patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
+def test_deepseek_w4a8_accuracy(model):
+    prompts = [
+        "Hello, my name is", "The president of the United States is",
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs" + ] + vllm_ds_w4a8_answers = [ + '逍遙而至地去 accrued', '平行于我udo madreHelen', 'ysteepaolis backwards Kj' + ] + sampling_params = SamplingParams(max_tokens=5, temperature=0.0) + with VllmRunner(snapshot_download(model), + dtype="auto", + tensor_parallel_size=2, + quantization="ascend", + enable_expert_parallel=True) as vllm_model: + vllm_quant_outputs = vllm_model.model.generate(prompts, + sampling_params) + + vllm_quant_outputs_list = [] + for output in vllm_quant_outputs: + vllm_quant_outputs_list.append( + ([output.outputs[0].index], output.outputs[0].text)) + vllm_answer_list = [] + vllm_answer_list = ([([0], answer) for answer in vllm_ds_w4a8_answers]) + + check_outputs_equal(outputs_0_lst=vllm_answer_list, + outputs_1_lst=vllm_quant_outputs_list, + name_0="vllm_quant_outputs", + name_1="vllm_answer_outputs") + + @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"}) @patch.dict(os.environ, {"VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "1"}) def test_fc2_for_qwen3_moe() -> None: