[CI] refect e2e ci test (#5246)
### What this PR does / why we need it?
efect e2e ci test:
1. tests/e2e/singlecard/pooling/test_embedding.py: remove the eager
parameter and rename test case
2. tests/e2e/singlecard/pooling/test_scoring.py: Rename test cases
3. tests/e2e/singlecard/pooling/test_classification.py: Rename test case
4. tests/e2e/singlecard/test_quantization.py: remove the eager parameter
and chage model to vllm-ascend/Qwen2.5-0.6B-W8A8 and Rename test case
5. tests/e2e/multicard/test_shared_expert_dp.py: Rename test cases
6. tests/e2e/singlecard/test_sampler.py: Rename test cases
7. tests/e2e/singlecard/test_aclgraph_accuracy.py: Rename test cases
8. tests/e2e/multicard/test_offline_inference_distributed.py: Rename
test cases and remove the eager parameter
9. tests/e2e/multicard/long_sequence/test_accuracy.py: Rename test cases
and remove the eager parameter
10. tests/e2e/multicard/long_sequence/test_basic.py: Rename test cases
and remove the eager parameter
11.tests/e2e/multicard/test_expert_parallel.py:remove the eager
parameter
12.tests/e2e/multicard/test_full_graph_mode.py:remove the eager
parameter
13.tests/e2e/multicard/test_ilama_lora_tp2.py:remove the eager parameter
14.tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py:remove
the eager parameter
15.tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py:remove the
eager parameter
16.tests/e2e/singlecard/test_aclgraph_accuracy.py:remove the eager
parameter
17.tests/e2e/singlecard/test_camem.py:remove the eager parameter
18.tests/e2e/singlecard/test_ilama_lora.py:remove the eager parameter
19.tests/e2e/singlecard/test_multistream_overlap_shared_expert.py:remove
the eager parameter
20.tests/e2e/singlecard/test_vlm.py:remove the eager parameter
21.tests/e2e/singlecard/test_xli:remove the eager parameter
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: release/v0.13.0
- vLLM main:
ad32e3e19c
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
@@ -36,7 +36,7 @@ MODELS = [
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
def test_output_between_eager_and_aclgraph(
|
||||
def test_models_output_between_eager_and_aclgraph(
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
@@ -50,7 +50,6 @@ def test_output_between_eager_and_aclgraph(
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
quantization="ascend",
|
||||
) as runner:
|
||||
vllm_aclgraph_outputs = runner.model.generate(
|
||||
@@ -68,7 +67,6 @@ def test_output_between_eager_and_aclgraph(
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
) as runner:
|
||||
vllm_aclgraph_outputs = runner.model.generate(
|
||||
prompts, sampling_params)
|
||||
@@ -100,7 +98,7 @@ def test_output_between_eager_and_aclgraph(
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
def test_output_between_eager_and_full_decode_only(
|
||||
def test_models_output_between_eager_and_full_decode_only(
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
@@ -155,7 +153,6 @@ def test_output_between_eager_and_full_decode_only(
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
|
||||
quantization="ascend",
|
||||
) as runner:
|
||||
@@ -166,7 +163,6 @@ def test_output_between_eager_and_full_decode_only(
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
compilation_config={
|
||||
"cudagraph_capture_sizes": [4, 8, 32, 64],
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY"
|
||||
@@ -196,7 +192,7 @@ def test_output_between_eager_and_full_decode_only(
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
def test_output_between_eager_and_fullgraph_npugraph_ex(
|
||||
def test_models_output_between_eager_and_fullgraph_npugraph_ex(
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
@@ -251,7 +247,6 @@ def test_output_between_eager_and_fullgraph_npugraph_ex(
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
|
||||
additional_config={"enable_npugraph_ex": True},
|
||||
quantization="ascend",
|
||||
@@ -263,7 +258,6 @@ def test_output_between_eager_and_fullgraph_npugraph_ex(
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
compilation_config={
|
||||
"cudagraph_capture_sizes": [4, 8, 32, 64],
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY"
|
||||
|
||||
Reference in New Issue
Block a user