diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index f3edf491..5ee2fd25 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -92,7 +92,7 @@ jobs:
           # pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_accuracy.py
           # pytest -sv --durations=0 tests/e2e/singlecard/test_quantization.py
           pytest -sv --durations=0 tests/e2e/singlecard/test_vlm.py::test_multimodal_vl
-          pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_classification.py::test_classify_correctness
+          pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_classification.py::test_qwen_pooling_classify_correctness
 
       - name: Run e2e test
         env:
diff --git a/tests/e2e/multicard/long_sequence/test_accuracy.py b/tests/e2e/multicard/long_sequence/test_accuracy.py
index a8bbf50a..af111899 100644
--- a/tests/e2e/multicard/long_sequence/test_accuracy.py
+++ b/tests/e2e/multicard/long_sequence/test_accuracy.py
@@ -36,7 +36,7 @@ MODELS = [
                     reason="0.12.0 is not supported for context sequence.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [10])
-def test_output_between_tp_and_cp(
+def test_models_long_sequence_output_between_tp_and_cp(
     model: str,
     max_tokens: int,
 ) -> None:
@@ -69,7 +69,6 @@ def test_output_between_tp_and_cp(
             "tensor_parallel_size": 1,
             "decode_context_parallel_size": 1,
             "prefill_context_parallel_size": 2,
-            "enforce_eager": False,
             "compilation_config": {
                 "cudagraph_mode": "FULL_DECODE_ONLY",
                 "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
diff --git a/tests/e2e/multicard/long_sequence/test_basic.py b/tests/e2e/multicard/long_sequence/test_basic.py
index 25422678..bc6f839a 100644
--- a/tests/e2e/multicard/long_sequence/test_basic.py
+++ b/tests/e2e/multicard/long_sequence/test_basic.py
@@ -34,7 +34,7 @@ os.environ["HCCL_BUFFSIZE"] = "768"
 
 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
-def test_pcp_dcp_basic():
+def test_models_pcp_dcp_basic():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
@@ -69,7 +69,7 @@ def test_pcp_dcp_basic():
 
 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
-def test_pcp_dcp_full_graph():
+def test_models_pcp_dcp_full_graph():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
@@ -77,7 +77,6 @@ def test_pcp_dcp_full_graph():
     model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
     with VllmRunner(model,
-                    enforce_eager=False,
                     max_model_len=1024,
                     tensor_parallel_size=2,
                     prefill_context_parallel_size=2,
@@ -93,7 +92,6 @@ def test_pcp_dcp_full_graph():
 
     model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
     with VllmRunner(model,
-                    enforce_eager=False,
                     max_model_len=1024,
                     tensor_parallel_size=2,
                     prefill_context_parallel_size=2,
@@ -110,7 +108,7 @@ def test_pcp_dcp_full_graph():
 
 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
-def test_pcp_dcp_piece_wise():
+def test_models_pcp_dcp_piece_wise():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
@@ -118,7 +116,6 @@ def test_pcp_dcp_piece_wise():
     model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
     with VllmRunner(model,
-                    enforce_eager=False,
                     max_model_len=1024,
                     tensor_parallel_size=2,
                     prefill_context_parallel_size=2,
@@ -130,7 +127,6 @@ def test_pcp_dcp_piece_wise():
 
     model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
     with VllmRunner(model,
-                    enforce_eager=False,
                     max_model_len=1024,
                     tensor_parallel_size=2,
                     prefill_context_parallel_size=2,
diff --git a/tests/e2e/multicard/test_expert_parallel.py b/tests/e2e/multicard/test_expert_parallel.py
index 762ca6d2..9149d12e 100644
--- a/tests/e2e/multicard/test_expert_parallel.py
+++ b/tests/e2e/multicard/test_expert_parallel.py
@@ -15,14 +15,12 @@ def test_deepseek_correctness_ep(model_name):
     max_tokens = 5
 
     # FIXME: Really strange that chunked prefill might lead to different results, investigate further
-    with VllmRunner(model_name, tensor_parallel_size=2,
-                    enforce_eager=False) as vllm_model:
+    with VllmRunner(model_name, tensor_parallel_size=2) as vllm_model:
         tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
 
     with VllmRunner(model_name,
                     tensor_parallel_size=2,
-                    enable_expert_parallel=True,
-                    enforce_eager=False) as vllm_model:
+                    enable_expert_parallel=True) as vllm_model:
         ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
 
     check_outputs_equal(
diff --git a/tests/e2e/multicard/test_full_graph_mode.py b/tests/e2e/multicard/test_full_graph_mode.py
index 362f8be7..e47b4e18 100644
--- a/tests/e2e/multicard/test_full_graph_mode.py
+++ b/tests/e2e/multicard/test_full_graph_mode.py
@@ -41,7 +41,6 @@ def test_qwen3_moe_full_decode_only_tp2():
     with VllmRunner(model,
                     max_model_len=1024,
                     tensor_parallel_size=2,
-                    enforce_eager=False,
                     compilation_config={
                         "cudagraph_mode": "FULL_DECODE_ONLY",
                         "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
@@ -53,7 +52,6 @@ def test_qwen3_moe_full_decode_only_tp2():
             model,
             max_model_len=1024,
             tensor_parallel_size=2,
-            enforce_eager=False,
     ) as runner:
         vllm_eager_outputs = runner.model.generate(prompts,
                                                    sampling_params)
@@ -87,7 +85,6 @@ def test_qwen3_moe_full_graph_tp2():
     with VllmRunner(model,
                     max_model_len=1024,
                     tensor_parallel_size=2,
-                    enforce_eager=False,
                     compilation_config={
                         "cudagraph_mode": "FULL",
                         "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
@@ -99,7 +96,6 @@ def test_qwen3_moe_full_graph_tp2():
             model,
             max_model_len=1024,
             tensor_parallel_size=2,
-            enforce_eager=False,
     ) as runner:
         vllm_eager_outputs = runner.model.generate(prompts,
                                                    sampling_params)
diff --git a/tests/e2e/multicard/test_ilama_lora_tp2.py b/tests/e2e/multicard/test_ilama_lora_tp2.py
index 750039c3..230aac16 100644
--- a/tests/e2e/multicard/test_ilama_lora_tp2.py
+++ b/tests/e2e/multicard/test_ilama_lora_tp2.py
@@ -8,15 +8,16 @@ from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
 
 @pytest.mark.parametrize("distributed_executor_backend", ["mp"])
 def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
-    with VllmRunner(snapshot_download(MODEL_PATH),
-                    enable_lora=True,
-                    max_loras=4,
-                    dtype="half",
-                    max_model_len=1024,
-                    max_num_seqs=16,
-                    tensor_parallel_size=2,
-                    distributed_executor_backend=distributed_executor_backend,
-                    enforce_eager=False) as vllm_model:
+    with VllmRunner(
+            snapshot_download(MODEL_PATH),
+            enable_lora=True,
+            max_loras=4,
+            dtype="half",
+            max_model_len=1024,
+            max_num_seqs=16,
+            tensor_parallel_size=2,
+            distributed_executor_backend=distributed_executor_backend,
+    ) as vllm_model:
         output = do_sample(vllm_model.model, ilama_lora_files, lora_id=2)
 
     for i in range(len(EXPECTED_LORA_OUTPUT)):
diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py
index 558067ce..e5f4b2c2 100644
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -189,7 +189,6 @@ def test_qwen3_dense_fc1_tp2(model):
     with VllmRunner(
             snapshot_download(model),
             max_model_len=8192,
-            enforce_eager=False,
             dtype="auto",
             tensor_parallel_size=2,
             quantization="ascend",
@@ -209,7 +208,6 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
     with VllmRunner(
             snapshot_download(model),
             max_model_len=8192,
-            enforce_eager=False,
             dtype="auto",
             tensor_parallel_size=2,
             quantization="ascend",
diff --git a/tests/e2e/singlecard/pooling/test_classification.py b/tests/e2e/singlecard/pooling/test_classification.py
index e59983c1..8bdd3660 100644
--- a/tests/e2e/singlecard/pooling/test_classification.py
+++ b/tests/e2e/singlecard/pooling/test_classification.py
@@ -5,7 +5,7 @@ from transformers import AutoModelForSequenceClassification
 from tests.e2e.conftest import HfRunner, VllmRunner
 
 
-def test_classify_correctness() -> None:
+def test_qwen_pooling_classify_correctness() -> None:
 
     model_name = snapshot_download("Howeee/Qwen2.5-1.5B-apeach")
 
diff --git a/tests/e2e/singlecard/pooling/test_embedding.py b/tests/e2e/singlecard/pooling/test_embedding.py
index a564dfbb..8800fc7a 100644
--- a/tests/e2e/singlecard/pooling/test_embedding.py
+++ b/tests/e2e/singlecard/pooling/test_embedding.py
@@ -36,7 +36,6 @@ def test_embed_models_correctness(model: str):
     with VllmRunner(
             model_name,
             runner="pooling",
-            enforce_eager=False,
             max_model_len=None,
             cudagraph_capture_sizes=[4],
     ) as vllm_runner:
@@ -58,14 +57,13 @@ def test_embed_models_correctness(model: str):
     )
 
 
-def test_bge_model_correctness():
+def test_bge_m3_correctness():
     queries = ['What is the capital of China?', 'Explain gravity']
 
     model_name = snapshot_download("BAAI/bge-m3")
 
     with VllmRunner(
             model_name,
             runner="pooling",
-            enforce_eager=False,
     ) as vllm_aclgraph_runner:
         vllm_aclgraph_outputs = vllm_aclgraph_runner.embed(queries)
diff --git a/tests/e2e/singlecard/pooling/test_scoring.py b/tests/e2e/singlecard/pooling/test_scoring.py
index c196a0bf..9656b422 100644
--- a/tests/e2e/singlecard/pooling/test_scoring.py
+++ b/tests/e2e/singlecard/pooling/test_scoring.py
@@ -34,7 +34,7 @@ def model_name(request):
     yield snapshot_download(request.param)
 
 
-def test_cross_encoder_1_to_1(model_name):
+def test_cross_encoder_score_1_to_1(model_name):
     text_pair = [TEXTS_1[0], TEXTS_2[0]]
 
     with HfRunner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
@@ -53,7 +53,7 @@
     assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
 
 
-def test_cross_encoder_1_to_N(model_name):
+def test_cross_encoder_score_1_to_N(model_name):
     text_pairs = [
         [TEXTS_1[0], TEXTS_2[0]],
         [TEXTS_1[0], TEXTS_2[1]],
@@ -76,7 +76,7 @@
     assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
 
 
-def test_cross_encoder_N_to_N(model_name):
+def test_cross_encoder_score_N_to_N(model_name):
     text_pairs = [
         [TEXTS_1[0], TEXTS_2[0]],
         [TEXTS_1[1], TEXTS_2[1]],
@@ -104,7 +104,7 @@ def emb_model_name(request):
     yield snapshot_download(request.param)
 
 
-def test_embedding_1_to_1(emb_model_name):
+def test_embedding_score_1_to_1(emb_model_name):
     text_pair = [TEXTS_1[0], TEXTS_2[0]]
 
     with HfRunner(emb_model_name, dtype=DTYPE,
@@ -127,7 +127,7 @@
     assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
 
 
-def test_embedding_1_to_N(emb_model_name):
+def test_embedding_score_1_to_N(emb_model_name):
     text_pairs = [
         [TEXTS_1[0], TEXTS_2[0]],
         [TEXTS_1[0], TEXTS_2[1]],
@@ -157,7 +157,7 @@
     assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
 
 
-def test_embedding_N_to_N(emb_model_name):
+def test_embedding_score_N_to_N(emb_model_name):
     text_pairs = [
         [TEXTS_1[0], TEXTS_2[0]],
         [TEXTS_1[1], TEXTS_2[1]],
diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
index 20f7d145..d8c8fb23 100644
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
@@ -171,4 +171,4 @@ def test_mtp2_correctness_piecewise_graph_with_pad(
     mtp_correctness(sampling_config,
                     model_name,
                     2,
-                    disable_padded_drafter_batch=False)
+                    disable_padded_drafter_batch=False)
\ No newline at end of file
diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
index e42e4bd1..1526997a 100644
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
@@ -76,19 +76,22 @@ def test_ngram_correctness(
     should be the same when using ngram speculative decoding.
     '''
 
-    with VllmRunner(model_name, max_model_len=1024,
-                    enforce_eager=False) as ref_llm:
+    with VllmRunner(
+            model_name,
+            max_model_len=1024,
+    ) as ref_llm:
         ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
 
-    with VllmRunner(model_name,
-                    speculative_config={
-                        "method": "ngram",
-                        "prompt_lookup_max": 5,
-                        "prompt_lookup_min": 3,
-                        "num_speculative_tokens": 3,
-                    },
-                    max_model_len=1024,
-                    enforce_eager=False) as runner:
+    with VllmRunner(
+            model_name,
+            speculative_config={
+                "method": "ngram",
+                "prompt_lookup_max": 5,
+                "prompt_lookup_min": 3,
+                "num_speculative_tokens": 3,
+            },
+            max_model_len=1024,
+    ) as runner:
         spec_outputs = runner.model.chat(test_prompts, sampling_config)
     matches = 0
     misses = 0
@@ -190,8 +193,7 @@
     Compare the outputs of a original LLM and a speculative LLM
     should be the same when using ngram speculative decoding.
     '''
-    with VllmRunner(model_name, max_model_len=1024,
-                    enforce_eager=False) as ref_llm:
+    with VllmRunner(model_name, max_model_len=1024) as ref_llm:
         ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
 
     with VllmRunner(model_name,
@@ -199,8 +201,7 @@
                         "method": "suffix",
                         "num_speculative_tokens": 8,
                     },
-                    max_model_len=1024,
-                    enforce_eager=False) as runner:
+                    max_model_len=1024) as runner:
         spec_outputs = runner.model.chat(test_prompts, sampling_config)
     matches = 0
     misses = 0
@@ -236,8 +237,7 @@
                         "num_speculative_tokens": 10,
                     },
                     max_model_len=1024,
-                    disable_log_stats=False,
-                    enforce_eager=False) as runner:
+                    disable_log_stats=False) as runner:
         for i in range(10):
             runner.model.chat(test_prompts[i], sampling_config)
         metrics = runner.model.get_metrics()
@@ -278,7 +278,7 @@
                                      max_tokens=10,
                                      ignore_eos=False)
 
-    ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=False)
+    ref_llm = LLM(model=model_name, max_model_len=2048)
     ref_outputs = ref_llm.chat([prompt], sampling_params)
     ref_logprobs = []
     for output in ref_outputs[0].outputs:
@@ -300,7 +300,6 @@
             "max_model_len": 128,
         },
         max_model_len=128,
-        enforce_eager=False,
     ) as runner:
         spec_outputs = runner.model.chat([prompt], sampling_params)
 
diff --git a/tests/e2e/singlecard/test_aclgraph_accuracy.py b/tests/e2e/singlecard/test_aclgraph_accuracy.py
index 9eb68894..baba92ea 100644
--- a/tests/e2e/singlecard/test_aclgraph_accuracy.py
+++ b/tests/e2e/singlecard/test_aclgraph_accuracy.py
@@ -36,7 +36,7 @@ MODELS = [
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
-def test_output_between_eager_and_aclgraph(
+def test_models_output_between_eager_and_aclgraph(
     model: str,
     max_tokens: int,
 ) -> None:
@@ -50,7 +50,6 @@ def test_output_between_eager_and_aclgraph(
         with VllmRunner(
                 model,
                 max_model_len=1024,
-                enforce_eager=False,
                 quantization="ascend",
         ) as runner:
             vllm_aclgraph_outputs = runner.model.generate(
@@ -68,7 +67,6 @@
         with VllmRunner(
                 model,
                 max_model_len=1024,
-                enforce_eager=False,
         ) as runner:
             vllm_aclgraph_outputs = runner.model.generate(
                 prompts, sampling_params)
@@ -100,7 +98,7 @@
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
-def test_output_between_eager_and_full_decode_only(
+def test_models_output_between_eager_and_full_decode_only(
     model: str,
     max_tokens: int,
 ) -> None:
@@ -155,7 +153,6 @@
         with VllmRunner(
                 model,
                 max_model_len=1024,
-                enforce_eager=False,
                 compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
                 quantization="ascend",
         ) as runner:
@@ -166,7 +163,6 @@
         with VllmRunner(
                 model,
                 max_model_len=1024,
-                enforce_eager=False,
                 compilation_config={
                     "cudagraph_capture_sizes": [4, 8, 32, 64],
                     "cudagraph_mode": "FULL_DECODE_ONLY"
@@ -196,7 +192,7 @@
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
-def test_output_between_eager_and_fullgraph_npugraph_ex(
+def test_models_output_between_eager_and_fullgraph_npugraph_ex(
     model: str,
     max_tokens: int,
 ) -> None:
@@ -251,7 +247,6 @@
         with VllmRunner(
                 model,
                 max_model_len=1024,
-                enforce_eager=False,
                 compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
                 additional_config={"enable_npugraph_ex": True},
                 quantization="ascend",
@@ -263,7 +258,6 @@
         with VllmRunner(
                 model,
                 max_model_len=1024,
-                enforce_eager=False,
                 compilation_config={
                     "cudagraph_capture_sizes": [4, 8, 32, 64],
                     "cudagraph_mode": "FULL_DECODE_ONLY"
diff --git a/tests/e2e/singlecard/test_camem.py b/tests/e2e/singlecard/test_camem.py
index cdf7527e..33a99a42 100644
--- a/tests/e2e/singlecard/test_camem.py
+++ b/tests/e2e/singlecard/test_camem.py
@@ -76,9 +76,7 @@ def test_end_to_end():
     prompt = "How are you?"
     sampling_params = SamplingParams(temperature=0, max_tokens=10)
 
-    with VllmRunner("Qwen/Qwen3-0.6B",
-                    enforce_eager=False,
-                    enable_sleep_mode=True) as runner:
+    with VllmRunner("Qwen/Qwen3-0.6B", enable_sleep_mode=True) as runner:
         output = runner.model.generate(prompt, sampling_params)
 
     # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
diff --git a/tests/e2e/singlecard/test_ilama_lora.py b/tests/e2e/singlecard/test_ilama_lora.py
index daeac4aa..1b941a06 100644
--- a/tests/e2e/singlecard/test_ilama_lora.py
+++ b/tests/e2e/singlecard/test_ilama_lora.py
@@ -45,13 +45,14 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
 
 
 def test_ilama_lora(ilama_lora_files):
-    with VllmRunner(snapshot_download(MODEL_PATH),
-                    enable_lora=True,
-                    dtype="half",
-                    max_loras=4,
-                    max_model_len=1024,
-                    max_num_seqs=16,
-                    enforce_eager=False) as vllm_model:
+    with VllmRunner(
+            snapshot_download(MODEL_PATH),
+            enable_lora=True,
+            dtype="half",
+            max_loras=4,
+            max_model_len=1024,
+            max_num_seqs=16,
+    ) as vllm_model:
         output1 = do_sample(vllm_model.model, ilama_lora_files, lora_id=1)
 
     for i in range(len(EXPECTED_LORA_OUTPUT)):
diff --git a/tests/e2e/singlecard/test_multistream_overlap_shared_expert.py b/tests/e2e/singlecard/test_multistream_overlap_shared_expert.py
index 89a78912..eb00cc67 100644
--- a/tests/e2e/singlecard/test_multistream_overlap_shared_expert.py
+++ b/tests/e2e/singlecard/test_multistream_overlap_shared_expert.py
@@ -58,7 +58,6 @@ def test_models_with_multistream_overlap_shared_expert(
     with VllmRunner(
             model,
             max_model_len=1024,
-            enforce_eager=False,
             additional_config={
                 "multistream_overlap_shared_expert": True,
             },
diff --git a/tests/e2e/singlecard/test_quantization.py b/tests/e2e/singlecard/test_quantization.py
index 95f26ee8..6ab54084 100644
--- a/tests/e2e/singlecard/test_quantization.py
+++ b/tests/e2e/singlecard/test_quantization.py
@@ -20,15 +20,14 @@ from modelscope import snapshot_download  # type: ignore[import-untyped]
 from tests.e2e.conftest import VllmRunner
 
 
-def test_quant_W8A8():
+def test_qwen3_w8a8_quant():
     max_tokens = 5
     example_prompts = [
         "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
     ]
     with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"),
+            snapshot_download("vllm-ascend/Qwen3-0.6B-W8A8"),
             max_model_len=8192,
-            enforce_eager=False,
             gpu_memory_utilization=0.7,
             quantization="ascend",
     ) as vllm_model:
diff --git a/tests/e2e/singlecard/test_sampler.py b/tests/e2e/singlecard/test_sampler.py
index 73055f3a..fbb03913 100644
--- a/tests/e2e/singlecard/test_sampler.py
+++ b/tests/e2e/singlecard/test_sampler.py
@@ -21,7 +21,7 @@ from vllm import SamplingParams
 from tests.e2e.conftest import VllmRunner
 
 
-def test_models_topk() -> None:
+def test_qwen3_topk() -> None:
     example_prompts = [
         "Hello, my name is",
    ]
@@ -36,7 +36,7 @@
         runner.generate(example_prompts, sampling_params)
 
 
-def test_models_prompt_logprobs() -> None:
+def test_qwen3_prompt_logprobs() -> None:
     example_prompts = [
         "Hello, my name is",
     ]
@@ -49,7 +49,7 @@
                                         num_logprobs=1)
 
 
-def test_exponential_overlap() -> None:
+def test_qwen3_exponential_overlap() -> None:
     example_prompts = [
         "Hello, my name is",
     ]
diff --git a/tests/e2e/singlecard/test_vlm.py b/tests/e2e/singlecard/test_vlm.py
index 6133157f..d2ca42be 100644
--- a/tests/e2e/singlecard/test_vlm.py
+++ b/tests/e2e/singlecard/test_vlm.py
@@ -46,7 +46,6 @@ def test_multimodal_vl(vl_config):
 
     with VllmRunner(vl_config["model"],
                     mm_processor_kwargs=vl_config["mm_processor_kwargs"],
-                    enforce_eager=False,
                     max_model_len=8192,
                     limit_mm_per_prompt={"image": 1}) as vllm_model:
         outputs = vllm_model.generate_greedy(
diff --git a/tests/e2e/singlecard/test_xlite.py b/tests/e2e/singlecard/test_xlite.py
index 0b10e63e..eba65a99 100644
--- a/tests/e2e/singlecard/test_xlite.py
+++ b/tests/e2e/singlecard/test_xlite.py
@@ -48,7 +48,6 @@ def test_models_with_xlite_decode_only(
            model,
             block_size=128,
             max_model_len=1024,
-            enforce_eager=False,
             additional_config={"xlite_graph_config": {
                 "enabled": True
             }},
@@ -97,7 +96,6 @@ def test_models_with_xlite_full_mode(
             model,
             block_size=128,
             max_model_len=1024,
-            enforce_eager=False,
             additional_config={
                 "xlite_graph_config": {
                     "enabled": True,