forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
0
vllm-v0.6.2/tests/async_engine/__init__.py
Normal file
0
vllm-v0.6.2/tests/async_engine/__init__.py
Normal file
51
vllm-v0.6.2/tests/async_engine/api_server_async_engine.py
Normal file
51
vllm-v0.6.2/tests/async_engine/api_server_async_engine.py
Normal file
@@ -0,0 +1,51 @@
|
||||
"""vllm.entrypoints.api_server with some extra logging for testing."""
|
||||
from typing import Any, Dict, Iterable
|
||||
|
||||
import uvicorn
|
||||
from fastapi.responses import JSONResponse, Response
|
||||
|
||||
import vllm.entrypoints.api_server
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
app = vllm.entrypoints.api_server.app
|
||||
|
||||
|
||||
class AsyncLLMEngineWithStats(AsyncLLMEngine):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._num_aborts = 0
|
||||
|
||||
async def _engine_abort(self, request_ids: Iterable[str]):
|
||||
ids = list(request_ids)
|
||||
self._num_aborts += len(ids)
|
||||
await super()._engine_abort(ids)
|
||||
|
||||
def testing_stats(self) -> Dict[str, Any]:
|
||||
return {"num_aborted_requests": self._num_aborts}
|
||||
|
||||
|
||||
@app.get("/stats")
|
||||
def stats() -> Response:
|
||||
"""Get the statistics of the engine."""
|
||||
return JSONResponse(engine.testing_stats())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser()
|
||||
parser.add_argument("--host", type=str, default="localhost")
|
||||
parser.add_argument("--port", type=int, default=8000)
|
||||
parser = AsyncEngineArgs.add_cli_args(parser)
|
||||
args = parser.parse_args()
|
||||
|
||||
engine_args = AsyncEngineArgs.from_cli_args(args)
|
||||
engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
|
||||
vllm.entrypoints.api_server.engine = engine
|
||||
uvicorn.run(
|
||||
app,
|
||||
host=args.host,
|
||||
port=args.port,
|
||||
log_level="debug",
|
||||
timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE)
|
||||
109
vllm-v0.6.2/tests/async_engine/test_api_server.py
Normal file
109
vllm-v0.6.2/tests/async_engine/test_api_server.py
Normal file
@@ -0,0 +1,109 @@
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from multiprocessing import Pool
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from vllm.utils import get_open_port
|
||||
|
||||
port = get_open_port()
|
||||
|
||||
def _query_server(prompt: str, max_tokens: int = 5) -> dict:
|
||||
response = requests.post(f"http://localhost:{port}/generate",
|
||||
json={
|
||||
"prompt": prompt,
|
||||
"max_tokens": max_tokens,
|
||||
"temperature": 0,
|
||||
"ignore_eos": True
|
||||
})
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
|
||||
def _query_server_long(prompt: str) -> dict:
|
||||
return _query_server(prompt, max_tokens=500)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
|
||||
script_path = Path(__file__).parent.joinpath(
|
||||
"api_server_async_engine.py").absolute()
|
||||
commands = [
|
||||
sys.executable, "-u",
|
||||
str(script_path), "--model", "facebook/opt-125m", "--host",
|
||||
"127.0.0.1", "--port", f"{port}", "--tokenizer-pool-size",
|
||||
str(tokenizer_pool_size)
|
||||
]
|
||||
|
||||
if worker_use_ray:
|
||||
commands.append("--worker-use-ray")
|
||||
uvicorn_process = subprocess.Popen(commands)
|
||||
yield
|
||||
uvicorn_process.terminate()
|
||||
time.sleep(10)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
|
||||
@pytest.mark.parametrize("worker_use_ray", [False, True])
|
||||
def test_api_server(api_server, tokenizer_pool_size: int,
|
||||
worker_use_ray: bool):
|
||||
"""
|
||||
Run the API server and test it.
|
||||
|
||||
We run both the server and requests in separate processes.
|
||||
|
||||
We test that the server can handle incoming requests, including
|
||||
multiple requests at the same time, and that it can handle requests
|
||||
being cancelled without crashing.
|
||||
"""
|
||||
with Pool(32) as pool:
|
||||
# Wait until the server is ready
|
||||
prompts = ["warm up"] * 1
|
||||
result = None
|
||||
while not result:
|
||||
try:
|
||||
for r in pool.map(_query_server, prompts):
|
||||
result = r
|
||||
break
|
||||
except requests.exceptions.ConnectionError:
|
||||
time.sleep(1)
|
||||
|
||||
# Actual tests start here
|
||||
# Try with 1 prompt
|
||||
for result in pool.map(_query_server, prompts):
|
||||
assert result
|
||||
|
||||
num_aborted_requests = requests.get(
|
||||
f"http://localhost:{port}/stats").json()["num_aborted_requests"]
|
||||
assert num_aborted_requests == 0
|
||||
|
||||
# Try with 100 prompts
|
||||
prompts = ["test prompt"] * 100
|
||||
for result in pool.map(_query_server, prompts):
|
||||
assert result
|
||||
|
||||
with Pool(32) as pool:
|
||||
# Cancel requests
|
||||
prompts = ["canceled requests"] * 100
|
||||
pool.map_async(_query_server_long, prompts)
|
||||
time.sleep(0.01)
|
||||
pool.terminate()
|
||||
pool.join()
|
||||
|
||||
# check cancellation stats
|
||||
# give it some times to update the stats
|
||||
time.sleep(1)
|
||||
|
||||
num_aborted_requests = requests.get(
|
||||
f"http://localhost:{port}/stats").json()["num_aborted_requests"]
|
||||
assert num_aborted_requests > 0
|
||||
|
||||
# check that server still runs after cancellations
|
||||
with Pool(32) as pool:
|
||||
# Try with 100 prompts
|
||||
prompts = ["test prompt after canceled"] * 100
|
||||
for result in pool.map(_query_server, prompts):
|
||||
assert result
|
||||
374
vllm-v0.6.2/tests/async_engine/test_async_llm_engine.py
Normal file
374
vllm-v0.6.2/tests/async_engine/test_async_llm_engine.py
Normal file
@@ -0,0 +1,374 @@
|
||||
import asyncio
|
||||
import os
|
||||
import uuid
|
||||
from asyncio import CancelledError
|
||||
from copy import copy
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import torch
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
|
||||
from vllm.outputs import RequestOutput as RealRequestOutput
|
||||
from vllm.sampling_params import RequestOutputKind
|
||||
|
||||
from ..utils import wait_for_gpu_memory_to_clear
|
||||
|
||||
|
||||
@dataclass
|
||||
class RequestOutput:
|
||||
request_id: int
|
||||
finished: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
class MockModelConfig:
|
||||
use_async_output_proc = True
|
||||
|
||||
|
||||
class MockEngine:
|
||||
|
||||
def __init__(self):
|
||||
self.step_calls = 0
|
||||
self.add_request_calls = 0
|
||||
self.abort_request_calls = 0
|
||||
self.request_id = None
|
||||
# Ugly, remove dependency when possible
|
||||
self.parallel_config = ParallelConfig(1, 1, False)
|
||||
self.model_config = MockModelConfig()
|
||||
|
||||
async def step_async(self, virtual_engine):
|
||||
# PP size is 1, ignore virtual engine
|
||||
self.step_calls += 1
|
||||
return [RequestOutput(
|
||||
request_id=self.request_id)] if self.request_id else []
|
||||
|
||||
async def process_model_inputs_async(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
async def stop_remote_worker_execution_loop_async(self):
|
||||
pass
|
||||
|
||||
def generate(self, request_id):
|
||||
self.request_id = request_id
|
||||
|
||||
def stop_generating(self):
|
||||
self.request_id = None
|
||||
|
||||
def add_request(self, **kwargs):
|
||||
del kwargs # Unused
|
||||
self.add_request_calls += 1
|
||||
print(f'Request calls: {self.add_request_calls}')
|
||||
|
||||
async def add_request_async(self, **kwargs):
|
||||
self.add_request_calls += 1
|
||||
return
|
||||
|
||||
def abort_request(self, request_id):
|
||||
del request_id # Unused
|
||||
self.abort_request_calls += 1
|
||||
|
||||
def has_unfinished_requests(self):
|
||||
return self.request_id is not None
|
||||
|
||||
def has_unfinished_requests_for_virtual_engine(self, virtual_engine):
|
||||
return self.request_id is not None
|
||||
|
||||
|
||||
class MockAsyncLLMEngine(AsyncLLMEngine):
|
||||
_engine_class = MockEngine
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_new_requests_event():
|
||||
params = SamplingParams()
|
||||
|
||||
engine = MockAsyncLLMEngine()
|
||||
engine.start_background_loop()
|
||||
await asyncio.sleep(0.01)
|
||||
assert engine.engine.step_calls == 0
|
||||
|
||||
await engine.add_request("1", "", params)
|
||||
await asyncio.sleep(0.01)
|
||||
assert engine.engine.add_request_calls == 1
|
||||
assert engine.engine.step_calls == 1
|
||||
|
||||
await engine.add_request("2", "", params)
|
||||
engine.engine.generate("2")
|
||||
await asyncio.sleep(0)
|
||||
await asyncio.sleep(0)
|
||||
await asyncio.sleep(0)
|
||||
assert engine.engine.add_request_calls == 2
|
||||
assert engine.engine.step_calls >= 2
|
||||
await asyncio.sleep(0.001)
|
||||
assert engine.engine.step_calls >= 3
|
||||
engine.engine.stop_generating()
|
||||
await asyncio.sleep(0.001)
|
||||
old_step_calls = engine.engine.step_calls
|
||||
await asyncio.sleep(0.001)
|
||||
assert engine.engine.step_calls == old_step_calls
|
||||
|
||||
await engine.add_request("3", "", params)
|
||||
await asyncio.sleep(0.01)
|
||||
assert engine.engine.add_request_calls == 3
|
||||
assert engine.engine.step_calls == old_step_calls + 1
|
||||
await asyncio.sleep(0.01)
|
||||
assert engine.engine.add_request_calls == 3
|
||||
assert engine.engine.step_calls == old_step_calls + 1
|
||||
|
||||
engine = MockAsyncLLMEngine()
|
||||
assert engine.get_model_config() is not None
|
||||
assert engine.get_tokenizer() is not None
|
||||
assert engine.get_decoding_config() is not None
|
||||
|
||||
|
||||
def start_engine():
|
||||
wait_for_gpu_memory_to_clear(
|
||||
devices=list(range(torch.cuda.device_count())),
|
||||
threshold_bytes=2 * 2**30,
|
||||
timeout_s=60,
|
||||
)
|
||||
|
||||
num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1"))
|
||||
print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")
|
||||
|
||||
return AsyncLLMEngine.from_engine_args(
|
||||
AsyncEngineArgs(model="facebook/opt-125m",
|
||||
enforce_eager=True,
|
||||
num_scheduler_steps=num_scheduler_steps))
|
||||
|
||||
|
||||
def uid() -> str:
|
||||
return str(uuid.uuid4())
|
||||
|
||||
|
||||
@pytest_asyncio.fixture(scope="module")
|
||||
async def async_engine():
|
||||
engine = await asyncio.get_event_loop().run_in_executor(executor=None,
|
||||
func=start_engine)
|
||||
try:
|
||||
yield engine
|
||||
finally:
|
||||
engine.shutdown_background_loop()
|
||||
del engine
|
||||
await asyncio.sleep(0.1)
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def should_do_global_cleanup_after_test(request) -> bool:
|
||||
# So we can share the async engine fixture between these tests
|
||||
return False
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
|
||||
async def test_asyncio_run(async_engine, stop):
|
||||
|
||||
scheduler_config = await async_engine.get_scheduler_config()
|
||||
num_scheduler_steps = scheduler_config.num_scheduler_steps
|
||||
|
||||
async def run(prompt: str):
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
max_tokens=32,
|
||||
min_tokens=32,
|
||||
stop=stop,
|
||||
)
|
||||
|
||||
output_count = 0
|
||||
final_output = None
|
||||
async for output in async_engine.generate(prompt,
|
||||
sampling_params,
|
||||
request_id=uid()):
|
||||
output_count += 1
|
||||
final_output = output
|
||||
return final_output, output_count
|
||||
|
||||
results = await asyncio.gather(
|
||||
run("test0"),
|
||||
run("test0"),
|
||||
)
|
||||
assert len(results) == 2
|
||||
first, second = results
|
||||
|
||||
# remove nondeterministic fields for comparison
|
||||
first[0].metrics = None
|
||||
second[0].metrics = None
|
||||
first[0].request_id = None
|
||||
second[0].request_id = None
|
||||
|
||||
assert str(first) == str(second)
|
||||
|
||||
output_count = results[0][1]
|
||||
if num_scheduler_steps == 1:
|
||||
assert output_count == 32
|
||||
else:
|
||||
assert 1 < output_count < 32
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
|
||||
async def test_output_kinds(async_engine, stop):
|
||||
"""Test that output_kind works as expected and that
|
||||
results are equivalent across different kinds."""
|
||||
|
||||
scheduler_config = await async_engine.get_scheduler_config()
|
||||
num_scheduler_steps = scheduler_config.num_scheduler_steps
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
max_tokens=32,
|
||||
min_tokens=32,
|
||||
stop=stop,
|
||||
)
|
||||
|
||||
async def run(prompt: str, kind: RequestOutputKind):
|
||||
params = copy(sampling_params)
|
||||
params.output_kind = kind
|
||||
|
||||
output_count = 0
|
||||
final_output = None
|
||||
async for output in async_engine.generate(prompt,
|
||||
params,
|
||||
request_id=uid()):
|
||||
output_count += 1
|
||||
final_output = output
|
||||
|
||||
assert final_output is not None
|
||||
assert final_output.finished
|
||||
|
||||
return (final_output.prompt_token_ids,
|
||||
final_output.outputs[0].token_ids,
|
||||
final_output.outputs[0].text, output_count)
|
||||
|
||||
async def run_deltas(prompt: str):
|
||||
params = copy(sampling_params)
|
||||
params.output_kind = RequestOutputKind.DELTA
|
||||
|
||||
prompt_tokens = None
|
||||
output_tokens: List[int] = []
|
||||
output_text = ""
|
||||
output_count = 0
|
||||
final_output = None
|
||||
async for output in async_engine.generate(prompt,
|
||||
params,
|
||||
request_id=uid()):
|
||||
token_ids = output.outputs[0].token_ids
|
||||
text = output.outputs[0].text
|
||||
final_output = output
|
||||
|
||||
# Ensure we get prompt ids iff we haven't yet received output tokens
|
||||
if output_tokens:
|
||||
assert 1 <= len(token_ids) <= num_scheduler_steps
|
||||
assert stop or text
|
||||
assert not output.prompt_token_ids
|
||||
else:
|
||||
assert output.prompt_token_ids
|
||||
prompt_tokens = output.prompt_token_ids
|
||||
|
||||
output_tokens.extend(token_ids)
|
||||
output_text += text
|
||||
|
||||
output_count += 1
|
||||
|
||||
assert final_output is not None
|
||||
assert final_output.finished
|
||||
|
||||
return prompt_tokens, output_tokens, output_text, output_count
|
||||
|
||||
results = await asyncio.gather(
|
||||
run("common input prompt", RequestOutputKind.CUMULATIVE),
|
||||
run("common input prompt", RequestOutputKind.FINAL_ONLY),
|
||||
run_deltas("common input prompt"))
|
||||
|
||||
# Make sure outputs are the same
|
||||
prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results)
|
||||
assert len(prompt_set) == 1
|
||||
|
||||
text_set = set(text for _, _, text, _ in results)
|
||||
assert len(text_set) == 1
|
||||
|
||||
tokens_set = set(tuple(ids) for _, ids, _, _ in results)
|
||||
assert len(tokens_set) == 1
|
||||
|
||||
cumulative, final, deltas = results
|
||||
|
||||
# output message counts
|
||||
assert cumulative[3] == deltas[3]
|
||||
|
||||
if num_scheduler_steps == 1:
|
||||
assert cumulative[3] == 32
|
||||
else:
|
||||
assert 1 < cumulative[3] < 32
|
||||
|
||||
assert final[3] == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
|
||||
async def test_cancellation(async_engine, stop):
|
||||
scheduler_config = await async_engine.get_scheduler_config()
|
||||
num_scheduler_steps = scheduler_config.num_scheduler_steps
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
min_tokens=13,
|
||||
max_tokens=13,
|
||||
stop=stop,
|
||||
)
|
||||
|
||||
stop_at = 5 if num_scheduler_steps == 1 else 1
|
||||
|
||||
request_id = uid()
|
||||
|
||||
i = 0
|
||||
with pytest.raises(CancelledError):
|
||||
async for output in async_engine.generate("test2",
|
||||
sampling_params,
|
||||
request_id=request_id):
|
||||
assert not output.finished
|
||||
i += 1
|
||||
if i == stop_at:
|
||||
await async_engine.abort(request_id)
|
||||
|
||||
assert i == stop_at
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
|
||||
async def test_delayed_generator(async_engine, stop):
|
||||
scheduler_config = await async_engine.get_scheduler_config()
|
||||
|
||||
if scheduler_config.num_scheduler_steps != 1:
|
||||
pytest.skip("no need to test this one with multistep")
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
min_tokens=10,
|
||||
max_tokens=10,
|
||||
stop=stop,
|
||||
)
|
||||
|
||||
stream = async_engine.generate("test3", sampling_params, request_id=uid())
|
||||
i = 0
|
||||
final_output: Optional[RealRequestOutput] = None
|
||||
async for output in stream:
|
||||
final_output = output
|
||||
if i == 0:
|
||||
# wait for generation to complete before consuming
|
||||
# the remaining messages
|
||||
await asyncio.sleep(1)
|
||||
if i < 9:
|
||||
assert not output.finished
|
||||
i += 1
|
||||
|
||||
assert i == 10
|
||||
assert final_output is not None
|
||||
assert len(final_output.outputs[0].token_ids) == 10
|
||||
assert final_output.finished
|
||||
106
vllm-v0.6.2/tests/async_engine/test_openapi_server.py
Normal file
106
vllm-v0.6.2/tests/async_engine/test_openapi_server.py
Normal file
@@ -0,0 +1,106 @@
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from ..utils import VLLM_PATH, RemoteOpenAIServer
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "facebook/opt-125m"
|
||||
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
|
||||
assert chatml_jinja_path.exists()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"float16",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--enforce-eager",
|
||||
"--chat-template",
|
||||
str(chatml_jinja_path),
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_check_models(client: openai.AsyncOpenAI):
|
||||
models = await client.models.list()
|
||||
models = models.data
|
||||
served_model = models[0]
|
||||
assert served_model.id == MODEL_NAME
|
||||
assert all(model.root == MODEL_NAME for model in models)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_single_completion(client: openai.AsyncOpenAI):
|
||||
completion = await client.completions.create(model=MODEL_NAME,
|
||||
prompt="Hello, my name is",
|
||||
max_tokens=5,
|
||||
temperature=0.0)
|
||||
|
||||
assert completion.id is not None
|
||||
assert len(completion.choices) == 1
|
||||
assert len(completion.choices[0].text) >= 5
|
||||
assert completion.choices[0].finish_reason == "length"
|
||||
assert completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=5, prompt_tokens=6, total_tokens=11)
|
||||
|
||||
# test using token IDs
|
||||
completion = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt=[0, 0, 0, 0, 0],
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
)
|
||||
assert len(completion.choices[0].text) >= 5
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_single_chat_session(client: openai.AsyncOpenAI):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "what is 1+1?"
|
||||
}]
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_tokens=10,
|
||||
logprobs=True,
|
||||
top_logprobs=5)
|
||||
assert chat_completion.id is not None
|
||||
assert len(chat_completion.choices) == 1
|
||||
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.finish_reason == "length"
|
||||
assert chat_completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=10, prompt_tokens=55, total_tokens=65)
|
||||
|
||||
message = choice.message
|
||||
assert message.content is not None and len(message.content) >= 10
|
||||
assert message.role == "assistant"
|
||||
messages.append({"role": "assistant", "content": message.content})
|
||||
|
||||
# test multi-turn dialogue
|
||||
messages.append({"role": "user", "content": "express your result in json"})
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_tokens=10,
|
||||
)
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None and len(message.content) >= 0
|
||||
68
vllm-v0.6.2/tests/async_engine/test_request_tracker.py
Normal file
68
vllm-v0.6.2/tests/async_engine/test_request_tracker.py
Normal file
@@ -0,0 +1,68 @@
|
||||
import pytest
|
||||
|
||||
from vllm.engine.async_llm_engine import RequestTracker
|
||||
from vllm.outputs import RequestOutput
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_request_tracker():
|
||||
tracker = RequestTracker()
|
||||
stream_1 = tracker.add_request("1")
|
||||
assert tracker.new_requests_event.is_set()
|
||||
await tracker.wait_for_new_requests()
|
||||
new, aborted = tracker.get_new_and_aborted_requests()
|
||||
assert not tracker.new_requests_event.is_set()
|
||||
assert len(new) == 1
|
||||
assert new[0]["request_id"] == "1"
|
||||
assert not aborted
|
||||
assert not stream_1.finished
|
||||
|
||||
stream_2 = tracker.add_request("2")
|
||||
stream_3 = tracker.add_request("3")
|
||||
assert tracker.new_requests_event.is_set()
|
||||
await tracker.wait_for_new_requests()
|
||||
new, aborted = tracker.get_new_and_aborted_requests()
|
||||
assert not tracker.new_requests_event.is_set()
|
||||
assert len(new) == 2
|
||||
assert new[0]["request_id"] == "2"
|
||||
assert new[1]["request_id"] == "3"
|
||||
assert not aborted
|
||||
assert not stream_2.finished
|
||||
assert not stream_3.finished
|
||||
|
||||
# request_ids must be unique
|
||||
with pytest.raises(KeyError):
|
||||
tracker.add_request("1")
|
||||
assert not tracker.new_requests_event.is_set()
|
||||
|
||||
tracker.abort_request("1")
|
||||
new, aborted = tracker.get_new_and_aborted_requests()
|
||||
assert len(aborted) == 1
|
||||
assert "1" in aborted
|
||||
assert not new
|
||||
assert stream_1.finished
|
||||
|
||||
stream_4 = tracker.add_request("4")
|
||||
tracker.abort_request("4")
|
||||
assert tracker.new_requests_event.is_set()
|
||||
await tracker.wait_for_new_requests()
|
||||
new, aborted = tracker.get_new_and_aborted_requests()
|
||||
# aborted new requests will cancel each other out -
|
||||
# there's no need for them to propagate into the
|
||||
# engine
|
||||
assert not aborted
|
||||
assert not new
|
||||
assert stream_4.finished
|
||||
|
||||
stream_5 = tracker.add_request("5")
|
||||
assert tracker.new_requests_event.is_set()
|
||||
tracker.process_request_output(
|
||||
RequestOutput("2", "output", [], [], [], finished=True))
|
||||
await tracker.wait_for_new_requests()
|
||||
new, aborted = tracker.get_new_and_aborted_requests()
|
||||
assert not tracker.new_requests_event.is_set()
|
||||
assert not aborted
|
||||
assert len(new) == 1
|
||||
assert new[0]["request_id"] == "5"
|
||||
assert stream_2.finished
|
||||
assert not stream_5.finished
|
||||
Reference in New Issue
Block a user