From 9fbcfa36af96e89d25b38da6546385873b8e8a77 Mon Sep 17 00:00:00 2001 From: fluctlux <38945811+fluctlux@users.noreply.github.com> Date: Mon, 8 Dec 2025 09:26:29 +0800 Subject: [PATCH] [CI] Fix ngram & suffix test oom (#4755) ### What this PR does / why we need it? Avoid OOM during CI by using `with VllmRunner` instead of `LLM()`, and re-enable `test_ngram_correctness`. ### How was this patch tested? CI passed. - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 --------- Signed-off-by: fluctlux <38945811+fluctlux@users.noreply.github.com> Co-authored-by: wangxiyuan --- .github/workflows/_e2e_test.yaml | 3 +-- .../spec_decode_v1/test_v1_spec_decode.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index bacdfd65..d1504a27 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -108,8 +108,7 @@ jobs: # ------------------------------------ v1 spec decode test ------------------------------------ # pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py - # Fix me: test_eagle_correctness OOM error - #pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py + pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py e2e-2-cards: name: multicard-2 diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py index 5d74b5d4..d8c0fabe 100644 --- a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py +++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import os import random from typing import Any @@ -9,6 +10,8 @@ from vllm import LLM, SamplingParams from 
tests.e2e.conftest import VllmRunner +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + @pytest.fixture def test_prompts(): @@ -61,7 +64,6 @@ def eagle3_model_name(): return "vllm-ascend/EAGLE3-LLaMA3.1-Instruct-8B" -@pytest.mark.skip("TODO: Revert me after ngram oom issue on ci is fixed") def test_ngram_correctness( test_prompts: list[list[dict[str, Any]]], sampling_config: SamplingParams, @@ -71,9 +73,11 @@ def test_ngram_correctness( Compare the outputs of a original LLM and a speculative LLM should be the same when using ngram speculative decoding. ''' - ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=False) - ref_outputs = ref_llm.chat(test_prompts, sampling_config) - del ref_llm + + with VllmRunner(model_name, max_model_len=1024, + enforce_eager=False) as ref_llm: + ref_outputs = ref_llm.model.chat(test_prompts, sampling_config) + with VllmRunner(model_name, speculative_config={ "method": "ngram", @@ -156,9 +160,10 @@ def test_suffix_correctness( Compare the outputs of a original LLM and a speculative LLM should be the same when using ngram speculative decoding. ''' - ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=False) - ref_outputs = ref_llm.chat(test_prompts, sampling_config) - del ref_llm + with VllmRunner(model_name, max_model_len=1024, + enforce_eager=False) as ref_llm: + ref_outputs = ref_llm.model.chat(test_prompts, sampling_config) + with VllmRunner(model_name, speculative_config={ "method": "suffix",