diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index 5ee2fd25..97ccc4a6 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -117,6 +117,7 @@ jobs: pytest -sv --durations=0 tests/e2e/singlecard/pooling/ pytest -sv --durations=0 tests/e2e/singlecard/compile/test_norm_quant_fusion.py pytest -sv --durations=0 tests/e2e/singlecard/test_cross_layer_attn_model.py + pytest -sv --durations=0 tests/e2e/singlecard/test_multistream_overlap_shared_expert.py # ------------------------------------ v1 spec decode test ------------------------------------ # pytest -sv --durations=0 tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py @@ -219,6 +220,7 @@ jobs: pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_dense_fc1_tp2 pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_dense_prefetch_mlp_weight_tp2 pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_w4a8_accuracy_tp2 + pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_v2_lite_fc1_tp2 pytest -sv --durations=0 tests/e2e/multicard/test_prefix_caching.py pytest -sv --durations=0 tests/e2e/multicard/test_pipeline_parallel.py diff --git a/tests/e2e/singlecard/test_multistream_overlap_shared_expert.py b/tests/e2e/singlecard/test_multistream_overlap_shared_expert.py index eb00cc67..caf09bd9 100644 --- a/tests/e2e/singlecard/test_multistream_overlap_shared_expert.py +++ b/tests/e2e/singlecard/test_multistream_overlap_shared_expert.py @@ -48,9 +48,11 @@ def test_models_with_multistream_overlap_shared_expert( model, max_model_len=1024, enforce_eager=True, + cudagraph_capture_sizes=[4, 8, 16, 32], additional_config={ "multistream_overlap_shared_expert": True, }, + quantization="ascend", ) as runner: vllm_moe_ms_eager_outputs = runner.model.generate( prompts, sampling_params) @@ -58,9 +60,11 @@ def test_models_with_multistream_overlap_shared_expert( with VllmRunner( model, max_model_len=1024, + cudagraph_capture_sizes=[4, 8, 16, 32], additional_config={ "multistream_overlap_shared_expert": True, }, + quantization="ascend", ) as runner: vllm_moe_ms_aclgraph_outputs = runner.model.generate( prompts, sampling_params) @@ -69,6 +73,8 @@ def test_models_with_multistream_overlap_shared_expert( model, max_model_len=1024, enforce_eager=True, + cudagraph_capture_sizes=[4, 8, 16, 32], + quantization="ascend", ) as runner: vllm_eager_outputs = runner.model.generate(prompts, sampling_params)