[CI] refactor e2e ci test (#5246)
### What this PR does / why we need it?
Refactor e2e ci test:
1. tests/e2e/singlecard/pooling/test_embedding.py: remove the eager
parameter and rename test case
2. tests/e2e/singlecard/pooling/test_scoring.py: Rename test cases
3. tests/e2e/singlecard/pooling/test_classification.py: Rename test case
4. tests/e2e/singlecard/test_quantization.py: remove the eager parameter
and change model to vllm-ascend/Qwen3-0.6B-W8A8 and Rename test case
5. tests/e2e/multicard/test_shared_expert_dp.py: Rename test cases
6. tests/e2e/singlecard/test_sampler.py: Rename test cases
7. tests/e2e/singlecard/test_aclgraph_accuracy.py: Rename test cases
8. tests/e2e/multicard/test_offline_inference_distributed.py: Rename
test cases and remove the eager parameter
9. tests/e2e/multicard/long_sequence/test_accuracy.py: Rename test cases
and remove the eager parameter
10. tests/e2e/multicard/long_sequence/test_basic.py: Rename test cases
and remove the eager parameter
11. tests/e2e/multicard/test_expert_parallel.py: remove the eager
parameter
12. tests/e2e/multicard/test_full_graph_mode.py: remove the eager
parameter
13. tests/e2e/multicard/test_ilama_lora_tp2.py: remove the eager parameter
14. tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py: remove
the eager parameter
15. tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py: remove the
eager parameter
16. tests/e2e/singlecard/test_aclgraph_accuracy.py: remove the eager
parameter
17. tests/e2e/singlecard/test_camem.py: remove the eager parameter
18. tests/e2e/singlecard/test_ilama_lora.py: remove the eager parameter
19. tests/e2e/singlecard/test_multistream_overlap_shared_expert.py: remove
the eager parameter
20. tests/e2e/singlecard/test_vlm.py: remove the eager parameter
21. tests/e2e/singlecard/test_xlite.py: remove the eager parameter
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: release/v0.13.0
- vLLM main:
ad32e3e19c
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
2
.github/workflows/_e2e_test.yaml
vendored
2
.github/workflows/_e2e_test.yaml
vendored
@@ -92,7 +92,7 @@ jobs:
|
||||
# pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_accuracy.py
|
||||
# pytest -sv --durations=0 tests/e2e/singlecard/test_quantization.py
|
||||
pytest -sv --durations=0 tests/e2e/singlecard/test_vlm.py::test_multimodal_vl
|
||||
pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_classification.py::test_classify_correctness
|
||||
pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_classification.py::test_qwen_pooling_classify_correctness
|
||||
|
||||
- name: Run e2e test
|
||||
env:
|
||||
|
||||
@@ -36,7 +36,7 @@ MODELS = [
|
||||
reason="0.12.0 is not supported for context sequence.")
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [10])
|
||||
def test_output_between_tp_and_cp(
|
||||
def test_models_long_sequence_output_between_tp_and_cp(
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
@@ -69,7 +69,6 @@ def test_output_between_tp_and_cp(
|
||||
"tensor_parallel_size": 1,
|
||||
"decode_context_parallel_size": 1,
|
||||
"prefill_context_parallel_size": 2,
|
||||
"enforce_eager": False,
|
||||
"compilation_config": {
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY",
|
||||
"cudagraph_capture_sizes": [4, 8, 24, 48, 60]
|
||||
|
||||
@@ -34,7 +34,7 @@ os.environ["HCCL_BUFFSIZE"] = "768"
|
||||
|
||||
@pytest.mark.skipif(vllm_version_is('0.12.0'),
|
||||
reason="0.12.0 is not supported for context sequence.")
|
||||
def test_pcp_dcp_basic():
|
||||
def test_models_pcp_dcp_basic():
|
||||
prompts = [
|
||||
"The capital of France is", "Hello, my name is Tom, I am",
|
||||
"The president of United States is", "AI future is"
|
||||
@@ -69,7 +69,7 @@ def test_pcp_dcp_basic():
|
||||
|
||||
@pytest.mark.skipif(vllm_version_is('0.12.0'),
|
||||
reason="0.12.0 is not supported for context sequence.")
|
||||
def test_pcp_dcp_full_graph():
|
||||
def test_models_pcp_dcp_full_graph():
|
||||
prompts = [
|
||||
"The capital of France is", "Hello, my name is Tom, I am",
|
||||
"The president of United States is", "AI future is"
|
||||
@@ -77,7 +77,6 @@ def test_pcp_dcp_full_graph():
|
||||
model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
|
||||
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
|
||||
with VllmRunner(model,
|
||||
enforce_eager=False,
|
||||
max_model_len=1024,
|
||||
tensor_parallel_size=2,
|
||||
prefill_context_parallel_size=2,
|
||||
@@ -93,7 +92,6 @@ def test_pcp_dcp_full_graph():
|
||||
|
||||
model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
|
||||
with VllmRunner(model,
|
||||
enforce_eager=False,
|
||||
max_model_len=1024,
|
||||
tensor_parallel_size=2,
|
||||
prefill_context_parallel_size=2,
|
||||
@@ -110,7 +108,7 @@ def test_pcp_dcp_full_graph():
|
||||
|
||||
@pytest.mark.skipif(vllm_version_is('0.12.0'),
|
||||
reason="0.12.0 is not supported for context sequence.")
|
||||
def test_pcp_dcp_piece_wise():
|
||||
def test_models_pcp_dcp_piece_wise():
|
||||
prompts = [
|
||||
"The capital of France is", "Hello, my name is Tom, I am",
|
||||
"The president of United States is", "AI future is"
|
||||
@@ -118,7 +116,6 @@ def test_pcp_dcp_piece_wise():
|
||||
model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
|
||||
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
|
||||
with VllmRunner(model,
|
||||
enforce_eager=False,
|
||||
max_model_len=1024,
|
||||
tensor_parallel_size=2,
|
||||
prefill_context_parallel_size=2,
|
||||
@@ -130,7 +127,6 @@ def test_pcp_dcp_piece_wise():
|
||||
|
||||
model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
|
||||
with VllmRunner(model,
|
||||
enforce_eager=False,
|
||||
max_model_len=1024,
|
||||
tensor_parallel_size=2,
|
||||
prefill_context_parallel_size=2,
|
||||
|
||||
@@ -15,14 +15,12 @@ def test_deepseek_correctness_ep(model_name):
|
||||
max_tokens = 5
|
||||
|
||||
# FIXME: Really strange that chunked prefill might lead to different results, investigate further
|
||||
with VllmRunner(model_name, tensor_parallel_size=2,
|
||||
enforce_eager=False) as vllm_model:
|
||||
with VllmRunner(model_name, tensor_parallel_size=2) as vllm_model:
|
||||
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with VllmRunner(model_name,
|
||||
tensor_parallel_size=2,
|
||||
enable_expert_parallel=True,
|
||||
enforce_eager=False) as vllm_model:
|
||||
enable_expert_parallel=True) as vllm_model:
|
||||
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
|
||||
@@ -41,7 +41,6 @@ def test_qwen3_moe_full_decode_only_tp2():
|
||||
with VllmRunner(model,
|
||||
max_model_len=1024,
|
||||
tensor_parallel_size=2,
|
||||
enforce_eager=False,
|
||||
compilation_config={
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY",
|
||||
"cudagraph_capture_sizes": [4, 8, 24, 48, 60]
|
||||
@@ -53,7 +52,6 @@ def test_qwen3_moe_full_decode_only_tp2():
|
||||
model,
|
||||
max_model_len=1024,
|
||||
tensor_parallel_size=2,
|
||||
enforce_eager=False,
|
||||
) as runner:
|
||||
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
|
||||
|
||||
@@ -87,7 +85,6 @@ def test_qwen3_moe_full_graph_tp2():
|
||||
with VllmRunner(model,
|
||||
max_model_len=1024,
|
||||
tensor_parallel_size=2,
|
||||
enforce_eager=False,
|
||||
compilation_config={
|
||||
"cudagraph_mode": "FULL",
|
||||
"cudagraph_capture_sizes": [4, 8, 24, 48, 60]
|
||||
@@ -99,7 +96,6 @@ def test_qwen3_moe_full_graph_tp2():
|
||||
model,
|
||||
max_model_len=1024,
|
||||
tensor_parallel_size=2,
|
||||
enforce_eager=False,
|
||||
) as runner:
|
||||
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
|
||||
|
||||
|
||||
@@ -8,15 +8,16 @@ from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
|
||||
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["mp"])
|
||||
def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
|
||||
with VllmRunner(snapshot_download(MODEL_PATH),
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
dtype="half",
|
||||
max_model_len=1024,
|
||||
max_num_seqs=16,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=False) as vllm_model:
|
||||
with VllmRunner(
|
||||
snapshot_download(MODEL_PATH),
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
dtype="half",
|
||||
max_model_len=1024,
|
||||
max_num_seqs=16,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
) as vllm_model:
|
||||
output = do_sample(vllm_model.model, ilama_lora_files, lora_id=2)
|
||||
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
|
||||
@@ -189,7 +189,6 @@ def test_qwen3_dense_fc1_tp2(model):
|
||||
with VllmRunner(
|
||||
snapshot_download(model),
|
||||
max_model_len=8192,
|
||||
enforce_eager=False,
|
||||
dtype="auto",
|
||||
tensor_parallel_size=2,
|
||||
quantization="ascend",
|
||||
@@ -209,7 +208,6 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
|
||||
with VllmRunner(
|
||||
snapshot_download(model),
|
||||
max_model_len=8192,
|
||||
enforce_eager=False,
|
||||
dtype="auto",
|
||||
tensor_parallel_size=2,
|
||||
quantization="ascend",
|
||||
|
||||
@@ -5,7 +5,7 @@ from transformers import AutoModelForSequenceClassification
|
||||
from tests.e2e.conftest import HfRunner, VllmRunner
|
||||
|
||||
|
||||
def test_classify_correctness() -> None:
|
||||
def test_qwen_pooling_classify_correctness() -> None:
|
||||
|
||||
model_name = snapshot_download("Howeee/Qwen2.5-1.5B-apeach")
|
||||
|
||||
|
||||
@@ -36,7 +36,6 @@ def test_embed_models_correctness(model: str):
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
runner="pooling",
|
||||
enforce_eager=False,
|
||||
max_model_len=None,
|
||||
cudagraph_capture_sizes=[4],
|
||||
) as vllm_runner:
|
||||
@@ -58,14 +57,13 @@ def test_embed_models_correctness(model: str):
|
||||
)
|
||||
|
||||
|
||||
def test_bge_model_correctness():
|
||||
def test_bge_m3_correctness():
|
||||
queries = ['What is the capital of China?', 'Explain gravity']
|
||||
|
||||
model_name = snapshot_download("BAAI/bge-m3")
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
runner="pooling",
|
||||
enforce_eager=False,
|
||||
) as vllm_aclgraph_runner:
|
||||
vllm_aclgraph_outputs = vllm_aclgraph_runner.embed(queries)
|
||||
|
||||
|
||||
@@ -34,7 +34,7 @@ def model_name(request):
|
||||
yield snapshot_download(request.param)
|
||||
|
||||
|
||||
def test_cross_encoder_1_to_1(model_name):
|
||||
def test_cross_encoder_score_1_to_1(model_name):
|
||||
text_pair = [TEXTS_1[0], TEXTS_2[0]]
|
||||
|
||||
with HfRunner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
|
||||
@@ -53,7 +53,7 @@ def test_cross_encoder_1_to_1(model_name):
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
|
||||
|
||||
|
||||
def test_cross_encoder_1_to_N(model_name):
|
||||
def test_cross_encoder_score_1_to_N(model_name):
|
||||
text_pairs = [
|
||||
[TEXTS_1[0], TEXTS_2[0]],
|
||||
[TEXTS_1[0], TEXTS_2[1]],
|
||||
@@ -76,7 +76,7 @@ def test_cross_encoder_1_to_N(model_name):
|
||||
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
|
||||
|
||||
|
||||
def test_cross_encoder_N_to_N(model_name):
|
||||
def test_cross_encoder_score_N_to_N(model_name):
|
||||
text_pairs = [
|
||||
[TEXTS_1[0], TEXTS_2[0]],
|
||||
[TEXTS_1[1], TEXTS_2[1]],
|
||||
@@ -104,7 +104,7 @@ def emb_model_name(request):
|
||||
yield snapshot_download(request.param)
|
||||
|
||||
|
||||
def test_embedding_1_to_1(emb_model_name):
|
||||
def test_embedding_score_1_to_1(emb_model_name):
|
||||
text_pair = [TEXTS_1[0], TEXTS_2[0]]
|
||||
|
||||
with HfRunner(emb_model_name, dtype=DTYPE,
|
||||
@@ -127,7 +127,7 @@ def test_embedding_1_to_1(emb_model_name):
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
|
||||
|
||||
|
||||
def test_embedding_1_to_N(emb_model_name):
|
||||
def test_embedding_score_1_to_N(emb_model_name):
|
||||
text_pairs = [
|
||||
[TEXTS_1[0], TEXTS_2[0]],
|
||||
[TEXTS_1[0], TEXTS_2[1]],
|
||||
@@ -157,7 +157,7 @@ def test_embedding_1_to_N(emb_model_name):
|
||||
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
|
||||
|
||||
|
||||
def test_embedding_N_to_N(emb_model_name):
|
||||
def test_embedding_score_N_to_N(emb_model_name):
|
||||
text_pairs = [
|
||||
[TEXTS_1[0], TEXTS_2[0]],
|
||||
[TEXTS_1[1], TEXTS_2[1]],
|
||||
|
||||
@@ -171,4 +171,4 @@ def test_mtp2_correctness_piecewise_graph_with_pad(
|
||||
mtp_correctness(sampling_config,
|
||||
model_name,
|
||||
2,
|
||||
disable_padded_drafter_batch=False)
|
||||
disable_padded_drafter_batch=False)
|
||||
@@ -76,19 +76,22 @@ def test_ngram_correctness(
|
||||
should be the same when using ngram speculative decoding.
|
||||
'''
|
||||
|
||||
with VllmRunner(model_name, max_model_len=1024,
|
||||
enforce_eager=False) as ref_llm:
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
max_model_len=1024,
|
||||
) as ref_llm:
|
||||
ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
|
||||
|
||||
with VllmRunner(model_name,
|
||||
speculative_config={
|
||||
"method": "ngram",
|
||||
"prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 3,
|
||||
"num_speculative_tokens": 3,
|
||||
},
|
||||
max_model_len=1024,
|
||||
enforce_eager=False) as runner:
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
speculative_config={
|
||||
"method": "ngram",
|
||||
"prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 3,
|
||||
"num_speculative_tokens": 3,
|
||||
},
|
||||
max_model_len=1024,
|
||||
) as runner:
|
||||
spec_outputs = runner.model.chat(test_prompts, sampling_config)
|
||||
matches = 0
|
||||
misses = 0
|
||||
@@ -190,8 +193,7 @@ def test_suffix_correctness(
|
||||
Compare the outputs of a original LLM and a speculative LLM
|
||||
should be the same when using ngram speculative decoding.
|
||||
'''
|
||||
with VllmRunner(model_name, max_model_len=1024,
|
||||
enforce_eager=False) as ref_llm:
|
||||
with VllmRunner(model_name, max_model_len=1024) as ref_llm:
|
||||
ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
|
||||
|
||||
with VllmRunner(model_name,
|
||||
@@ -199,8 +201,7 @@ def test_suffix_correctness(
|
||||
"method": "suffix",
|
||||
"num_speculative_tokens": 8,
|
||||
},
|
||||
max_model_len=1024,
|
||||
enforce_eager=False) as runner:
|
||||
max_model_len=1024) as runner:
|
||||
spec_outputs = runner.model.chat(test_prompts, sampling_config)
|
||||
matches = 0
|
||||
misses = 0
|
||||
@@ -236,8 +237,7 @@ def test_suffix_acceptance(
|
||||
"num_speculative_tokens": 10,
|
||||
},
|
||||
max_model_len=1024,
|
||||
disable_log_stats=False,
|
||||
enforce_eager=False) as runner:
|
||||
disable_log_stats=False) as runner:
|
||||
for i in range(10):
|
||||
runner.model.chat(test_prompts[i], sampling_config)
|
||||
metrics = runner.model.get_metrics()
|
||||
@@ -278,7 +278,7 @@ def test_eagle_logprobs(
|
||||
max_tokens=10,
|
||||
ignore_eos=False)
|
||||
|
||||
ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=False)
|
||||
ref_llm = LLM(model=model_name, max_model_len=2048)
|
||||
ref_outputs = ref_llm.chat([prompt], sampling_params)
|
||||
ref_logprobs = []
|
||||
for output in ref_outputs[0].outputs:
|
||||
@@ -300,7 +300,6 @@ def test_eagle_logprobs(
|
||||
"max_model_len": 128,
|
||||
},
|
||||
max_model_len=128,
|
||||
enforce_eager=False,
|
||||
) as runner:
|
||||
spec_outputs = runner.model.chat([prompt], sampling_params)
|
||||
|
||||
|
||||
@@ -36,7 +36,7 @@ MODELS = [
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
def test_output_between_eager_and_aclgraph(
|
||||
def test_models_output_between_eager_and_aclgraph(
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
@@ -50,7 +50,6 @@ def test_output_between_eager_and_aclgraph(
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
quantization="ascend",
|
||||
) as runner:
|
||||
vllm_aclgraph_outputs = runner.model.generate(
|
||||
@@ -68,7 +67,6 @@ def test_output_between_eager_and_aclgraph(
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
) as runner:
|
||||
vllm_aclgraph_outputs = runner.model.generate(
|
||||
prompts, sampling_params)
|
||||
@@ -100,7 +98,7 @@ def test_output_between_eager_and_aclgraph(
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
def test_output_between_eager_and_full_decode_only(
|
||||
def test_models_output_between_eager_and_full_decode_only(
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
@@ -155,7 +153,6 @@ def test_output_between_eager_and_full_decode_only(
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
|
||||
quantization="ascend",
|
||||
) as runner:
|
||||
@@ -166,7 +163,6 @@ def test_output_between_eager_and_full_decode_only(
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
compilation_config={
|
||||
"cudagraph_capture_sizes": [4, 8, 32, 64],
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY"
|
||||
@@ -196,7 +192,7 @@ def test_output_between_eager_and_full_decode_only(
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
def test_output_between_eager_and_fullgraph_npugraph_ex(
|
||||
def test_models_output_between_eager_and_fullgraph_npugraph_ex(
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
@@ -251,7 +247,6 @@ def test_output_between_eager_and_fullgraph_npugraph_ex(
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
|
||||
additional_config={"enable_npugraph_ex": True},
|
||||
quantization="ascend",
|
||||
@@ -263,7 +258,6 @@ def test_output_between_eager_and_fullgraph_npugraph_ex(
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
compilation_config={
|
||||
"cudagraph_capture_sizes": [4, 8, 32, 64],
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY"
|
||||
|
||||
@@ -76,9 +76,7 @@ def test_end_to_end():
|
||||
prompt = "How are you?"
|
||||
sampling_params = SamplingParams(temperature=0, max_tokens=10)
|
||||
|
||||
with VllmRunner("Qwen/Qwen3-0.6B",
|
||||
enforce_eager=False,
|
||||
enable_sleep_mode=True) as runner:
|
||||
with VllmRunner("Qwen/Qwen3-0.6B", enable_sleep_mode=True) as runner:
|
||||
|
||||
output = runner.model.generate(prompt, sampling_params)
|
||||
# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
|
||||
|
||||
@@ -45,13 +45,14 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
|
||||
|
||||
def test_ilama_lora(ilama_lora_files):
|
||||
with VllmRunner(snapshot_download(MODEL_PATH),
|
||||
enable_lora=True,
|
||||
dtype="half",
|
||||
max_loras=4,
|
||||
max_model_len=1024,
|
||||
max_num_seqs=16,
|
||||
enforce_eager=False) as vllm_model:
|
||||
with VllmRunner(
|
||||
snapshot_download(MODEL_PATH),
|
||||
enable_lora=True,
|
||||
dtype="half",
|
||||
max_loras=4,
|
||||
max_model_len=1024,
|
||||
max_num_seqs=16,
|
||||
) as vllm_model:
|
||||
|
||||
output1 = do_sample(vllm_model.model, ilama_lora_files, lora_id=1)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
|
||||
@@ -58,7 +58,6 @@ def test_models_with_multistream_overlap_shared_expert(
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
additional_config={
|
||||
"multistream_overlap_shared_expert": True,
|
||||
},
|
||||
|
||||
@@ -20,15 +20,14 @@ from modelscope import snapshot_download # type: ignore[import-untyped]
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
|
||||
def test_quant_W8A8():
|
||||
def test_qwen3_w8a8_quant():
|
||||
max_tokens = 5
|
||||
example_prompts = [
|
||||
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
|
||||
]
|
||||
with VllmRunner(
|
||||
snapshot_download("vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"),
|
||||
snapshot_download("vllm-ascend/Qwen3-0.6B-W8A8"),
|
||||
max_model_len=8192,
|
||||
enforce_eager=False,
|
||||
gpu_memory_utilization=0.7,
|
||||
quantization="ascend",
|
||||
) as vllm_model:
|
||||
|
||||
@@ -21,7 +21,7 @@ from vllm import SamplingParams
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
|
||||
def test_models_topk() -> None:
|
||||
def test_qwen3_topk() -> None:
|
||||
example_prompts = [
|
||||
"Hello, my name is",
|
||||
]
|
||||
@@ -36,7 +36,7 @@ def test_models_topk() -> None:
|
||||
runner.generate(example_prompts, sampling_params)
|
||||
|
||||
|
||||
def test_models_prompt_logprobs() -> None:
|
||||
def test_qwen3_prompt_logprobs() -> None:
|
||||
example_prompts = [
|
||||
"Hello, my name is",
|
||||
]
|
||||
@@ -49,7 +49,7 @@ def test_models_prompt_logprobs() -> None:
|
||||
num_logprobs=1)
|
||||
|
||||
|
||||
def test_exponential_overlap() -> None:
|
||||
def test_qwen3_exponential_overlap() -> None:
|
||||
example_prompts = [
|
||||
"Hello, my name is",
|
||||
]
|
||||
|
||||
@@ -46,7 +46,6 @@ def test_multimodal_vl(vl_config):
|
||||
|
||||
with VllmRunner(vl_config["model"],
|
||||
mm_processor_kwargs=vl_config["mm_processor_kwargs"],
|
||||
enforce_eager=False,
|
||||
max_model_len=8192,
|
||||
limit_mm_per_prompt={"image": 1}) as vllm_model:
|
||||
outputs = vllm_model.generate_greedy(
|
||||
|
||||
@@ -48,7 +48,6 @@ def test_models_with_xlite_decode_only(
|
||||
model,
|
||||
block_size=128,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
additional_config={"xlite_graph_config": {
|
||||
"enabled": True
|
||||
}},
|
||||
@@ -97,7 +96,6 @@ def test_models_with_xlite_full_mode(
|
||||
model,
|
||||
block_size=128,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
additional_config={
|
||||
"xlite_graph_config": {
|
||||
"enabled": True,
|
||||
|
||||
Reference in New Issue
Block a user