forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
0
vllm-v0.6.2/tests/mq_llm_engine/__init__.py
Normal file
0
vllm-v0.6.2/tests/mq_llm_engine/__init__.py
Normal file
67
vllm-v0.6.2/tests/mq_llm_engine/test_abort.py
Normal file
67
vllm-v0.6.2/tests/mq_llm_engine/test_abort.py
Normal file
@@ -0,0 +1,67 @@
|
||||
"""Test that aborting is handled properly."""
|
||||
|
||||
import asyncio
|
||||
import tempfile
|
||||
import uuid
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
|
||||
MODEL = "facebook/opt-125m"
|
||||
ENGINE_ARGS = AsyncEngineArgs(model=MODEL)
|
||||
RAISED_ERROR = KeyError
|
||||
RAISED_VALUE = "foo"
|
||||
EXPECTED_TOKENS = 250
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
def tmp_socket():
    """Yield a unique ZMQ IPC socket path inside a throwaway directory."""
    with tempfile.TemporaryDirectory() as td:
        socket_path = f"ipc://{td}/{uuid.uuid4()}"
        yield socket_path
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_abort(tmp_socket):
    """Aborting one in-flight request must not disturb its neighbours."""
    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
                           ipc_path=tmp_socket) as engine:

        client = await engine.make_client()

        request_id_to_be_aborted = "request-aborted"
        request_ids_a = [f"request-a-{idx}" for idx in range(10)]
        request_ids_b = [f"request-b-{idx}" for idx in range(10)]

        def _spawn(rid):
            # One background generation task per request id.
            return asyncio.create_task(generate(client, rid, EXPECTED_TOKENS))

        # Requests launched before the victim request.
        tasks = [_spawn(rid) for rid in request_ids_a]

        # The request that will be aborted mid-flight.
        task_aborted = _spawn(request_id_to_be_aborted)

        # Requests launched after the victim request.
        tasks.extend(_spawn(rid) for rid in request_ids_b)

        # Let everything get going, then abort the victim.
        await asyncio.sleep(0.5)
        await client.abort(request_id_to_be_aborted)

        # Every surviving request must stream the full token count.
        for task in tasks:
            count, request_id = await task
            assert count == EXPECTED_TOKENS, (
                f"{request_id} generated only {count} tokens")

        # The aborted task would never complete on its own; cancel it.
        task_aborted.cancel()

        # Shutdown.
        client.close()
|
||||
293
vllm-v0.6.2/tests/mq_llm_engine/test_error_handling.py
Normal file
293
vllm-v0.6.2/tests/mq_llm_engine/test_error_handling.py
Normal file
@@ -0,0 +1,293 @@
|
||||
"""Test that various errors are handled properly."""
|
||||
|
||||
import asyncio
|
||||
import tempfile
|
||||
import time
|
||||
import uuid
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.mq_llm_engine.utils import RemoteMQLLMEngine
|
||||
from vllm import SamplingParams
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.engine.multiprocessing import MQEngineDeadError
|
||||
from vllm.engine.multiprocessing.engine import MQLLMEngine
|
||||
from vllm.entrypoints.openai.api_server import build_async_engine_client
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
MODEL = "facebook/opt-125m"
|
||||
ENGINE_ARGS = AsyncEngineArgs(model=MODEL, enforce_eager=True)
|
||||
RAISED_ERROR = KeyError
|
||||
RAISED_VALUE = "foo"
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
def tmp_socket():
    """Yield a unique ZMQ IPC socket path inside a throwaway directory."""
    with tempfile.TemporaryDirectory() as td:
        socket_path = f"ipc://{td}/{uuid.uuid4()}"
        yield socket_path
|
||||
|
||||
|
||||
def run_with_evil_forward(engine_args: AsyncEngineArgs, ipc_path: str):
    """Subprocess entry point: an engine whose forward pass always raises.

    ``execute_model`` is mocked to raise ``RAISED_ERROR(RAISED_VALUE)`` so
    the very first scheduled batch kills the engine.
    """
    evil_engine = MQLLMEngine.from_engine_args(
        engine_args=engine_args,
        usage_context=UsageContext.UNKNOWN_CONTEXT,
        ipc_path=ipc_path)

    # Every model execution fails with the sentinel error.
    evil_engine.engine.model_executor.execute_model = Mock(
        side_effect=RAISED_ERROR(RAISED_VALUE))

    evil_engine.start()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_evil_forward(tmp_socket):
    """A crash in the forward pass must surface as an engine-dead error."""
    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
                           ipc_path=tmp_socket,
                           run_fn=run_with_evil_forward) as engine:

        client = await engine.make_client()

        # No batch has run yet, so the initial health probe passes.
        await asyncio.sleep(2.0)
        await client.check_health()

        # The first generation triggers the mocked failure and the client
        # reports the engine as dead.
        with pytest.raises(MQEngineDeadError):
            async for _ in client.generate(prompt="Hello my name is",
                                           sampling_params=SamplingParams(),
                                           request_id=uuid.uuid4()):
                pass
        assert client.errored

        # Once the crash has propagated, health checks raise the original
        # error type.
        await asyncio.sleep(1.0)
        with pytest.raises(RAISED_ERROR):
            await client.check_health()
        assert client.errored

        # Shutdown.
        client.close()
|
||||
|
||||
|
||||
def run_with_evil_model_executor_health(engine_args: AsyncEngineArgs,
                                        ipc_path: str):
    """Subprocess entry point: an engine whose health check always raises."""
    evil_engine = MQLLMEngine.from_engine_args(
        engine_args=engine_args,
        usage_context=UsageContext.UNKNOWN_CONTEXT,
        ipc_path=ipc_path)

    # Health probes against the model executor fail with the sentinel error.
    evil_engine.engine.model_executor.check_health = Mock(
        side_effect=RAISED_ERROR)

    evil_engine.start()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_failed_health_check(tmp_socket):
    """A failing model-executor health probe must mark the engine dead."""
    with RemoteMQLLMEngine(
            engine_args=ENGINE_ARGS,
            ipc_path=tmp_socket,
            run_fn=run_with_evil_model_executor_health) as engine:

        client = await engine.make_client()
        assert client.is_running

        # Wait long enough for the periodic health probe to fire.
        await asyncio.sleep(15.)

        # The probe's failure surfaces as the original error type.
        with pytest.raises(RAISED_ERROR):
            await client.check_health()
        assert client.errored

        # Any further generation attempt reports the engine as dead.
        with pytest.raises(MQEngineDeadError):
            async for _ in client.generate(prompt="Hello my name is",
                                           sampling_params=SamplingParams(),
                                           request_id=uuid.uuid4()):
                pass

        client.close()
|
||||
|
||||
|
||||
def run_with_evil_abort(engine_args: AsyncEngineArgs, ipc_path: str):
    """Subprocess entry point: an engine whose abort handler always raises."""
    evil_engine = MQLLMEngine.from_engine_args(
        engine_args=engine_args,
        usage_context=UsageContext.UNKNOWN_CONTEXT,
        ipc_path=ipc_path)

    # Any abort_request call fails with the sentinel error.
    evil_engine.engine.abort_request = Mock(side_effect=RAISED_ERROR)

    evil_engine.start()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_failed_abort(tmp_socket):
    """An error raised while aborting must poison subsequent requests."""
    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
                           ipc_path=tmp_socket,
                           run_fn=run_with_evil_abort) as engine:

        client = await engine.make_client()
        assert client.is_running

        # The engine is healthy before the abort is attempted.
        await client.check_health()

        # Aborting an unknown request id makes the mocked abort handler
        # raise inside the engine process, crashing it.
        await client.abort(request_id="foo")

        # Later generations fail with an engine-dead error that still
        # references the original KeyError("foo").
        with pytest.raises(MQEngineDeadError) as execinfo:
            async for _ in client.generate(
                    prompt="Hello my name is",
                    sampling_params=SamplingParams(max_tokens=10),
                    request_id=uuid.uuid4()):
                pass
        assert "KeyError" in repr(execinfo.value)
        assert client.errored

        # Health checks re-raise the original error type.
        with pytest.raises(RAISED_ERROR):
            await client.check_health()

        client.close()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_batch_error(tmp_socket):
    """All requests in flight when the engine dies get the same dead error."""
    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
                           ipc_path=tmp_socket,
                           run_fn=run_with_evil_abort) as engine:

        client = await engine.make_client()
        assert client.is_running

        # The engine is healthy before we poison it.
        await client.check_health()

        async def do_generate(client):
            # min_tokens=2048 keeps the engine busy long enough for the
            # poisoned abort below to land while this batch is in flight.
            params = SamplingParams(min_tokens=2048, max_tokens=2048)
            async for _ in client.generate(prompt="Hello my name is",
                                           sampling_params=params,
                                           request_id=uuid.uuid4()):
                pass

        tasks = [asyncio.create_task(do_generate(client)) for _ in range(10)]

        # The mocked abort handler raises inside the engine process,
        # killing it while the batch above is still processing.
        await client.abort(request_id="foo")

        # Every in-flight request should resolve to the same
        # MQEngineDeadError carrying the original KeyError.
        errors = await asyncio.gather(*tasks, return_exceptions=True)
        for e in errors:
            assert isinstance(e, MQEngineDeadError)
            assert "KeyError" in repr(e)

        client.close()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_bad_request(tmp_socket):
    """A bad request fails cleanly without bringing the server down."""
    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
                           ipc_path=tmp_socket) as engine:

        client = await engine.make_client()

        # A request with a bogus LoRA adapter path should error out...
        with pytest.raises(ValueError):
            async for _ in client.generate(prompt="Hello my name is",
                                           sampling_params=SamplingParams(),
                                           request_id="abcd-1",
                                           lora_request=LoRARequest(
                                               "invalid-lora", 1,
                                               "invalid-path")):
                pass

        # ...while a normal follow-up request still succeeds.
        async for _ in client.generate(prompt="Hello my name is",
                                       sampling_params=SamplingParams(),
                                       request_id="abcd-2"):
            pass

        # Shutdown.
        client.close()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_mp_crash_detection(monkeypatch):
    """Server startup must shut down promptly when LLMEngine init crashes.

    Patches ``LLMEngine.__init__`` to raise, then verifies that
    ``build_async_engine_client`` gives up within 60 seconds instead of
    hanging on the dead engine process.
    """

    parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
    parser = make_arg_parser(parser)
    args = parser.parse_args([])

    # When LLMEngine is loaded, it will crash. Accept the (self, *args)
    # call signature so the intended ValueError propagates rather than a
    # TypeError from an argument-count mismatch.
    def mock_init(self, *args, **kwargs):
        raise ValueError

    monkeypatch.setattr(LLMEngine, "__init__", mock_init)

    start = time.perf_counter()
    async with build_async_engine_client(args):
        pass
    end = time.perf_counter()

    assert end - start < 60, ("Expected vLLM to gracefully shutdown in <60s "
                              "if there is an error in the startup.")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_mp_cuda_init():
    """Initializing CUDA in the API-server process must not break startup."""
    # Initialize CUDA here, before the engine subprocess is spawned.
    import torch
    torch.cuda.init()

    parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
    parser = make_arg_parser(parser)
    args = parser.parse_args([])

    # Startup should succeed; no assertion needed beyond not crashing.
    async with build_async_engine_client(args):
        pass
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_engine_process_death(tmp_socket):
    """A killed engine process must be detected by the client."""
    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
                           ipc_path=tmp_socket) as engine:

        client = await engine.make_client()
        assert client.is_running

        # Kill the engine subprocess out from under the client.
        engine.proc.kill()

        # Generation now reports the engine as dead.
        with pytest.raises(MQEngineDeadError):
            async for _ in client.generate(prompt="Hello my name is",
                                           sampling_params=SamplingParams(),
                                           request_id=uuid.uuid4()):
                pass

        # The health check names the dead engine process explicitly.
        with pytest.raises(RuntimeError, match="Engine process .* died"):
            await client.check_health()

        client.close()
|
||||
57
vllm-v0.6.2/tests/mq_llm_engine/test_load.py
Normal file
57
vllm-v0.6.2/tests/mq_llm_engine/test_load.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""Test that the MQLLMEngine is able to handle 10k concurrent requests."""
|
||||
|
||||
import asyncio
|
||||
import tempfile
|
||||
import uuid
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
|
||||
MODEL = "facebook/opt-125m"
|
||||
NUM_EXPECTED_TOKENS = 10
|
||||
NUM_REQUESTS = 10000
|
||||
|
||||
# Scenarios to test for num generated token.
|
||||
ENGINE_ARGS = AsyncEngineArgs(model=MODEL, disable_log_requests=True)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
def tmp_socket():
    """Yield a unique ZMQ IPC socket path inside a throwaway directory."""
    with tempfile.TemporaryDirectory() as td:
        socket_path = f"ipc://{td}/{uuid.uuid4()}"
        yield socket_path
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_load(tmp_socket):
    """Fire NUM_REQUESTS concurrent generations; each must produce the
    expected number of tokens."""
    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
                           ipc_path=tmp_socket) as engine:

        client = await engine.make_client()

        request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]

        # Launch every request concurrently.
        tasks = [
            asyncio.create_task(
                generate(client, request_id, NUM_EXPECTED_TOKENS))
            for request_id in request_ids
        ]

        # Remember the first request that came back short, if any.
        failed_request_id = None
        tokens = None
        for task in tasks:
            num_generated_tokens, request_id = await task
            if (num_generated_tokens != NUM_EXPECTED_TOKENS
                    and failed_request_id is None):
                failed_request_id = request_id
                tokens = num_generated_tokens

        assert failed_request_id is None, (
            f"{failed_request_id} generated {tokens} but "
            f"expected {NUM_EXPECTED_TOKENS}")

        # Shutdown.
        client.close()
|
||||
78
vllm-v0.6.2/tests/mq_llm_engine/utils.py
Normal file
78
vllm-v0.6.2/tests/mq_llm_engine/utils.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import asyncio
|
||||
import multiprocessing
|
||||
from typing import Callable, Tuple, Union
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.multiprocessing.client import MQLLMEngineClient
|
||||
from vllm.engine.multiprocessing.engine import MQLLMEngine
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
|
||||
|
||||
async def generate(
        client: MQLLMEngineClient,
        request_id: str,
        num_tokens: int,
        return_output: bool = False) -> Union[RequestOutput, Tuple[int, str]]:
    """Run one greedy generation and report how many outputs streamed back.

    Returns the final RequestOutput when ``return_output`` is True,
    otherwise a ``(stream_count, request_id)`` pair.
    """

    final_output = None
    count = 0
    sampling = SamplingParams(max_tokens=num_tokens, temperature=0)
    async for out in client.generate(request_id=request_id,
                                     prompt="Hello my name is Robert and",
                                     sampling_params=sampling):
        count += 1
        final_output = out
        # Yield control so sibling tasks can interleave their streams.
        await asyncio.sleep(0.)

    if return_output:
        return final_output

    # Report the number of streamed outputs for this request.
    return count, request_id
|
||||
|
||||
|
||||
def run_normal(engine_args: AsyncEngineArgs, ipc_path: str):
    """Subprocess entry point: build a healthy engine and serve requests."""
    engine = MQLLMEngine.from_engine_args(
        engine_args=engine_args,
        usage_context=UsageContext.UNKNOWN_CONTEXT,
        ipc_path=ipc_path)

    # Blocks serving the request loop until the process is killed.
    engine.start()
|
||||
|
||||
|
||||
class RemoteMQLLMEngine:
    """Context manager running an MQLLMEngine in a spawned subprocess.

    On entry the engine process is already running (started in __init__);
    on exit the process is killed and reaped. Use ``make_client`` to obtain
    a connected MQLLMEngineClient.
    """

    def __init__(self,
                 engine_args: AsyncEngineArgs,
                 ipc_path: str,
                 run_fn: Callable = run_normal) -> None:

        self.engine_args = engine_args
        self.ipc_path = ipc_path
        # "spawn" avoids inheriting CUDA or other state from this process.
        context = multiprocessing.get_context("spawn")
        self.proc = context.Process(target=run_fn,
                                    args=(engine_args, ipc_path))
        self.proc.start()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.proc.kill()
        # Reap the child so it does not linger as a zombie process.
        self.proc.join()

    async def make_client(self) -> MQLLMEngineClient:
        """Connect a client, retrying setup until the engine is ready.

        Raises AssertionError if the engine process dies before the
        client manages to connect.
        """
        engine_config = self.engine_args.create_engine_config()
        client = MQLLMEngineClient(self.ipc_path, engine_config, self.proc.pid)
        while True:
            try:
                await client.setup()
                break
            except TimeoutError:
                # Keep retrying only while the engine process is alive.
                assert self.proc.is_alive()
        return client
|
||||
Reference in New Issue
Block a user