Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -0,0 +1,108 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test that we shut down and free GPU memory upon engine deletion."""

import pytest

from tests.utils import wait_for_gpu_memory_to_clear
from tests.v1.shutdown.utils import (
    SHUTDOWN_TEST_THRESHOLD_BYTES,
    SHUTDOWN_TEST_TIMEOUT_SEC,
)
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.sampling_params import RequestOutputKind
from vllm.utils.torch_utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM

MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]


@pytest.mark.asyncio
@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tensor_parallel_size", [2, 1])
@pytest.mark.parametrize("send_one_request", [False, True])
async def test_async_llm_delete(
    model: str, tensor_parallel_size: int, send_one_request: bool
) -> None:
    """Test that AsyncLLM frees GPU memory upon deletion.

    AsyncLLM always uses an MP client.

    Args:
        model: model under test
        tensor_parallel_size: degree of tensor parallelism
        send_one_request: send one request to engine before deleting
    """
    if cuda_device_count_stateless() < tensor_parallel_size:
        pytest.skip(reason="Not enough CUDA devices")

    engine_args = AsyncEngineArgs(
        model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size
    )

    # Instantiate AsyncLLM; make a request to complete any deferred
    # initialization; then delete the instance.
    async_llm = AsyncLLM.from_engine_args(engine_args)
    if send_one_request:
        async for _ in async_llm.generate(
            "Hello my name is",
            request_id="abc",
            sampling_params=SamplingParams(
                max_tokens=1, output_kind=RequestOutputKind.DELTA
            ),
        ):
            pass
    del async_llm

    # Confirm all the processes are cleaned up.
    wait_for_gpu_memory_to_clear(
        devices=list(range(tensor_parallel_size)),
        threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES,
    )


@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tensor_parallel_size", [2, 1])
@pytest.mark.parametrize("enable_multiprocessing", [True])
@pytest.mark.parametrize("send_one_request", [False, True])
def test_llm_delete(
    monkeypatch,
    model: str,
    tensor_parallel_size: int,
    enable_multiprocessing: bool,
    send_one_request: bool,
) -> None:
    """Test that LLM frees GPU memory upon deletion.

    TODO(andy) - LLM without multiprocessing.

    Args:
        model: model under test
        tensor_parallel_size: degree of tensor parallelism
        enable_multiprocessing: enable workers in separate process(es)
        send_one_request: send one request to engine before deleting
    """
    if cuda_device_count_stateless() < tensor_parallel_size:
        pytest.skip(reason="Not enough CUDA devices")

    with monkeypatch.context() as m:
        MP_VALUE = "1" if enable_multiprocessing else "0"
        m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE)

        # Instantiate LLM; make a request to complete any deferred
        # initialization; then delete the instance.
        llm = LLM(
            model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size
        )
        if send_one_request:
            llm.generate(
                "Hello my name is", sampling_params=SamplingParams(max_tokens=1)
            )
        del llm

    # Confirm all the processes are cleaned up.
    wait_for_gpu_memory_to_clear(
        devices=list(range(tensor_parallel_size)),
        threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES,
    )

View File

@@ -0,0 +1,134 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test that we handle an Error in model forward and shutdown."""

import asyncio

import pytest

from tests.utils import wait_for_gpu_memory_to_clear
from tests.v1.shutdown.utils import (
    SHUTDOWN_TEST_THRESHOLD_BYTES,
    SHUTDOWN_TEST_TIMEOUT_SEC,
)
from vllm import LLM, AsyncEngineArgs, SamplingParams
from vllm.distributed import get_tensor_model_parallel_rank
from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.utils.torch_utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.exceptions import EngineDeadError

MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]


def evil_forward(self, *args, **kwargs):
    """Evil forward method that raises an exception after 10 calls."""
    NUMBER_OF_GOOD_PASSES = 10

    if not hasattr(self, "num_calls"):
        self.num_calls = 0

    if (
        self.num_calls == NUMBER_OF_GOOD_PASSES
        and get_tensor_model_parallel_rank() == 0
    ):
        raise Exception("Simulated illegal memory access on Rank 0!")

    self.num_calls += 1
    return self.model(*args, **kwargs)


@pytest.mark.asyncio
@pytest.mark.parametrize("tensor_parallel_size", [2, 1])
@pytest.mark.parametrize("model", MODELS)
async def test_async_llm_model_error(
    monkeypatch, tensor_parallel_size: int, model: str
) -> None:
    """Test that AsyncLLM propagates a forward pass error and frees memory.

    AsyncLLM always uses an MP client.
    """
    if cuda_device_count_stateless() < tensor_parallel_size:
        pytest.skip(reason="Not enough CUDA devices")

    # Monkeypatch an error in the model.
    monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward)

    engine_args = AsyncEngineArgs(
        model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size
    )
    async_llm = AsyncLLM.from_engine_args(engine_args)

    async def generate(request_id: str):
        generator = async_llm.generate(
            "Hello my name is", request_id=request_id, sampling_params=SamplingParams()
        )
        try:
            async for _ in generator:
                pass
        except Exception as e:
            return e

    NUM_REQS = 3
    tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)]
    outputs = await asyncio.gather(*tasks)

    # Every request should get an EngineDeadError.
    for output in outputs:
        assert isinstance(output, EngineDeadError)

    # AsyncLLM should be errored.
    assert async_llm.errored

    # We should not be able to make another request.
    with pytest.raises(EngineDeadError):
        async for _ in async_llm.generate(
            "Hello my name is", request_id="abc", sampling_params=SamplingParams()
        ):
            raise Exception("We should not get here.")

    # Confirm all the processes are cleaned up.
    wait_for_gpu_memory_to_clear(
        devices=list(range(tensor_parallel_size)),
        threshold_bytes=2 * 2**30,
        timeout_s=60,
    )

    # NOTE: when running behind the API server, shutdown is handled there if
    # an exception occurs; there is no server in this test, so it is expected
    # that we need to call shutdown() explicitly here.
    async_llm.shutdown()


@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC)
@pytest.mark.parametrize("enable_multiprocessing", [True])
@pytest.mark.parametrize("tensor_parallel_size", [2, 1])
@pytest.mark.parametrize("model", MODELS)
def test_llm_model_error(
    monkeypatch, tensor_parallel_size: int, enable_multiprocessing: bool, model: str
) -> None:
    """Test that LLM propagates a forward pass error and frees memory.

    TODO(andy) - LLM without multiprocessing; LLM with multiprocessing
    and >1 rank.
    """
    if cuda_device_count_stateless() < tensor_parallel_size:
        pytest.skip(reason="Not enough CUDA devices")

    with monkeypatch.context() as m:
        MP_VALUE = "1" if enable_multiprocessing else "0"
        m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE)

        # Monkeypatch an error in the model.
        m.setattr(LlamaForCausalLM, "forward", evil_forward)

        llm = LLM(
            model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size
        )

        with pytest.raises(EngineDeadError if enable_multiprocessing else Exception):
            llm.generate("Hello my name is Robert and I")

    # Confirm all the processes are cleaned up.
    wait_for_gpu_memory_to_clear(
        devices=list(range(tensor_parallel_size)),
        threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES,
    )

View File

@@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test error handling in Processor. Should not impact other reqs."""

import asyncio

import pytest

from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT_SEC
from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.inputs.data import TokensPrompt
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.exceptions import EngineGenerateError

MODELS = ["meta-llama/Llama-3.2-1B"]


@pytest.mark.asyncio
@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC)
@pytest.mark.parametrize("model", MODELS)
async def test_async_llm_processor_error(model: str) -> None:
    """Test that AsyncLLM propagates a processor error.

    Test empty tokens prompt (failure) and non-empty prompt (no failure).

    AsyncLLM always uses an MP client.
    """
    engine_args = AsyncEngineArgs(model=model, enforce_eager=True)
    async_llm = AsyncLLM.from_engine_args(engine_args)

    async def generate(request_id: str):
        # An empty prompt_token_ids list is not allowed and will raise a
        # ValueError in Processor.
        generator = async_llm.generate(
            TokensPrompt(prompt_token_ids=[]),
            request_id=request_id,
            sampling_params=SamplingParams(),
        )
        try:
            async for _ in generator:
                pass
        except Exception as e:
            return e

    NUM_REQS = 3
    tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)]
    outputs = await asyncio.gather(*tasks)

    # Every request should get an EngineGenerateError.
    for output in outputs:
        with pytest.raises(EngineGenerateError):
            raise output

    # AsyncLLM should not be errored; processor errors are per-request.
    assert not async_llm.errored

    # A new, valid request should be no problem.
    EXPECTED_TOKENS = 5
    outputs = []
    async for out in async_llm.generate(
        "Hello my name is",
        request_id="abc",
        sampling_params=SamplingParams(
            max_tokens=EXPECTED_TOKENS, output_kind=RequestOutputKind.DELTA
        ),
    ):
        outputs.append(out)

    generated_tokens = []
    for out in outputs:
        generated_tokens.extend(out.outputs[0].token_ids)
    assert len(generated_tokens) == EXPECTED_TOKENS

    async_llm.shutdown()
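
Aside (not part of this diff): the two files above exercise two distinct error classes. EngineDeadError means the engine process itself is gone and every subsequent request fails, while EngineGenerateError marks a single bad request with the engine still healthy. As a hedged sketch, assuming the same vllm.v1 exception types and AsyncLLM.generate signature used in these tests (the helper name collect_outputs is hypothetical), calling code might separate the two like this:

    from vllm import SamplingParams
    from vllm.v1.engine.async_llm import AsyncLLM
    from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError


    async def collect_outputs(async_llm: AsyncLLM, prompt, request_id: str):
        """Collect outputs for one request, mapping errors to their scope."""
        outputs = []
        try:
            async for out in async_llm.generate(
                prompt, request_id=request_id, sampling_params=SamplingParams()
            ):
                outputs.append(out)
        except EngineGenerateError:
            # Per-request failure (e.g. an empty TokensPrompt): drop this
            # request; the engine is still healthy and can serve others.
            return None
        except EngineDeadError:
            # The engine itself died; no request will succeed until restart.
            raise
        return outputs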

View File

@@ -0,0 +1,109 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test that we handle a startup Error and shutdown."""

import pytest

from tests.utils import wait_for_gpu_memory_to_clear
from tests.v1.shutdown.utils import (
    SHUTDOWN_TEST_THRESHOLD_BYTES,
    SHUTDOWN_TEST_TIMEOUT_SEC,
)
from vllm import LLM
from vllm.distributed import get_tensor_model_parallel_rank
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.utils.torch_utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM

MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]


def evil_method(self, *args, **kwargs):
    """Evil method that raises an exception."""
    if get_tensor_model_parallel_rank() == 0:
        raise Exception("Simulated Error in startup!")

    return self.model(*args, **kwargs, intermediate_tensors=None)


@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tensor_parallel_size", [2, 1])
@pytest.mark.parametrize("failing_method", ["forward", "load_weights"])
def test_async_llm_startup_error(
    monkeypatch, model: str, tensor_parallel_size: int, failing_method: str
) -> None:
    """Test that AsyncLLM propagates an __init__ error & frees memory.

    Test profiling (forward()) and load weights failures.

    AsyncLLM always uses an MP client.
    """
    if cuda_device_count_stateless() < tensor_parallel_size:
        pytest.skip(reason="Not enough CUDA devices")

    # Monkeypatch an error in the model.
    monkeypatch.setattr(LlamaForCausalLM, failing_method, evil_method)

    engine_args = AsyncEngineArgs(
        model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size
    )

    # Confirm we get an exception.
    with pytest.raises(Exception, match="initialization failed"):
        _ = AsyncLLM.from_engine_args(engine_args)

    # Confirm all the processes are cleaned up.
    wait_for_gpu_memory_to_clear(
        devices=list(range(tensor_parallel_size)),
        threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES,
    )


@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tensor_parallel_size", [2, 1])
@pytest.mark.parametrize("enable_multiprocessing", [True])
@pytest.mark.parametrize("failing_method", ["forward", "load_weights"])
def test_llm_startup_error(
    monkeypatch,
    model: str,
    tensor_parallel_size: int,
    enable_multiprocessing: bool,
    failing_method: str,
) -> None:
    """Test that LLM propagates an __init__ error and frees memory.

    Test profiling (forward()) and load weights failures.

    TODO(andy) - LLM without multiprocessing.
    """
    # Skip non-Llama models since we monkeypatch LlamaForCausalLM specifically.
    # If the MODELS list grows, each architecture needs its own test variant.
    if model != "hmellor/tiny-random-LlamaForCausalLM":
        pytest.skip(reason="Only test hmellor/tiny-random-LlamaForCausalLM")
    if cuda_device_count_stateless() < tensor_parallel_size:
        pytest.skip(reason="Not enough CUDA devices")

    with monkeypatch.context() as m:
        MP_VALUE = "1" if enable_multiprocessing else "0"
        m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE)

        # Monkeypatch an error in the model.
        m.setattr(LlamaForCausalLM, failing_method, evil_method)

        with pytest.raises(
            Exception,
            match="initialization failed"
            if enable_multiprocessing
            else "Simulated Error in startup!",
        ):
            _ = LLM(
                model=model,
                enforce_eager=True,
                tensor_parallel_size=tensor_parallel_size,
            )

    # Confirm all the processes are cleaned up.
    wait_for_gpu_memory_to_clear(
        devices=list(range(tensor_parallel_size)),
        threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES,
    )

View File

@@ -0,0 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Shutdown test utils"""

SHUTDOWN_TEST_TIMEOUT_SEC = 120
SHUTDOWN_TEST_THRESHOLD_BYTES = 2 * 2**30
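
For context (not part of this diff): the cleanup assertions in the files above call wait_for_gpu_memory_to_clear from tests.utils with devices, threshold_bytes, and optionally timeout_s. A minimal sketch of such a polling helper, assuming it only needs whole-device usage from torch (the real helper may instead query NVML for per-process usage), could look like this; the function name and body are assumptions, not the actual implementation:

    import time

    import torch


    def wait_for_gpu_memory_to_clear_sketch(
        devices: list[int],
        threshold_bytes: int,
        timeout_s: float = 120.0,
    ) -> None:
        """Poll each device until its used memory drops below threshold_bytes."""
        deadline = time.monotonic() + timeout_s
        while True:
            usage = {}
            for device in devices:
                # mem_get_info reports (free, total) for the whole device, so
                # leftover worker processes holding memory are also counted.
                free, total = torch.cuda.mem_get_info(device)
                usage[device] = total - free
            if all(used < threshold_bytes for used in usage.values()):
                return
            if time.monotonic() > deadline:
                raise TimeoutError(f"GPU memory did not clear in time: {usage}")
            time.sleep(1.0)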