add qwen3

2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions
--- a/vllm-v0.6.2/tests/async_engine/__init__.py
+++ b/vllm-v0.6.2/tests/async_engine/__init__.py
--- a/vllm-v0.6.2/tests/async_engine/api_server_async_engine.py
+++ b/vllm-v0.6.2/tests/async_engine/api_server_async_engine.py
@@ -0,0 +1,51 @@
+"""vllm.entrypoints.api_server with some extra logging for testing."""
+from typing import Any, Dict, Iterable
+
+import uvicorn
+from fastapi.responses import JSONResponse, Response
+
+import vllm.entrypoints.api_server
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.utils import FlexibleArgumentParser
+
+# Reuse the app object defined by vllm.entrypoints.api_server, so the extra
+# /stats route declared below is registered on that same app.
+app = vllm.entrypoints.api_server.app
+
+
+class AsyncLLMEngineWithStats(AsyncLLMEngine):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._num_aborts = 0
+
+    async def _engine_abort(self, request_ids: Iterable[str]):
+        ids = list(request_ids)
+        self._num_aborts += len(ids)
+        await super()._engine_abort(ids)
+
+    def testing_stats(self) -> Dict[str, Any]:
+        return {"num_aborted_requests": self._num_aborts}
+
+
+@app.get("/stats")
+def stats() -> Response:
+    """Get the statistics of the engine."""
+    return JSONResponse(engine.testing_stats())
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser = AsyncEngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+
+    engine_args = AsyncEngineArgs.from_cli_args(args)
+    engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
+    vllm.entrypoints.api_server.engine = engine
+    uvicorn.run(
+        app,
+        host=args.host,
+        port=args.port,
+        log_level="debug",
+        timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE)
--- a/vllm-v0.6.2/tests/async_engine/test_api_server.py
+++ b/vllm-v0.6.2/tests/async_engine/test_api_server.py
@@ -0,0 +1,109 @@
+import subprocess
+import sys
+import time
+from multiprocessing import Pool
+from pathlib import Path
+
+import pytest
+import requests
+
+from vllm.utils import get_open_port
+
+# Chosen once at import time; the server fixture and every request helper in
+# this module read this same global.
+# NOTE(review): _query_server runs in multiprocessing.Pool workers. Under a
+# "spawn" start method the module would be re-imported in each worker and
+# get_open_port() could return a different port there - confirm that these
+# tests only run with "fork".
+port = get_open_port()
+
+def _query_server(prompt: str, max_tokens: int = 5) -> dict:
+    response = requests.post(f"http://localhost:{port}/generate",
+                             json={
+                                 "prompt": prompt,
+                                 "max_tokens": max_tokens,
+                                 "temperature": 0,
+                                 "ignore_eos": True
+                             })
+    response.raise_for_status()
+    return response.json()
+
+
+def _query_server_long(prompt: str) -> dict:
+    return _query_server(prompt, max_tokens=500)
+
+
+@pytest.fixture
+def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
+    script_path = Path(__file__).parent.joinpath(
+        "api_server_async_engine.py").absolute()
+    commands = [
+        sys.executable, "-u",
+        str(script_path), "--model", "facebook/opt-125m", "--host",
+        "127.0.0.1", "--port", f"{port}", "--tokenizer-pool-size",
+        str(tokenizer_pool_size)
+    ]
+
+    if worker_use_ray:
+        commands.append("--worker-use-ray")
+    uvicorn_process = subprocess.Popen(commands)
+    yield
+    uvicorn_process.terminate()
+    time.sleep(10)
+
+
+@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
+@pytest.mark.parametrize("worker_use_ray", [False, True])
+def test_api_server(api_server, tokenizer_pool_size: int,
+                    worker_use_ray: bool):
+    """
+    Run the API server and test it.
+
+    We run both the server and requests in separate processes.
+
+    We test that the server can handle incoming requests, including
+    multiple requests at the same time, and that it can handle requests
+    being cancelled without crashing.
+    """
+    with Pool(32) as pool:
+        # Wait until the server is ready
+        prompts = ["warm up"] * 1
+        result = None
+        while not result:
+            try:
+                for r in pool.map(_query_server, prompts):
+                    result = r
+                    break
+            except requests.exceptions.ConnectionError:
+                time.sleep(1)
+
+        # Actual tests start here
+        # Try with 1 prompt
+        for result in pool.map(_query_server, prompts):
+            assert result
+
+        num_aborted_requests = requests.get(
+            f"http://localhost:{port}/stats").json()["num_aborted_requests"]
+        assert num_aborted_requests == 0
+
+        # Try with 100 prompts
+        prompts = ["test prompt"] * 100
+        for result in pool.map(_query_server, prompts):
+            assert result
+
+    with Pool(32) as pool:
+        # Cancel requests
+        prompts = ["canceled requests"] * 100
+        pool.map_async(_query_server_long, prompts)
+        time.sleep(0.01)
+        pool.terminate()
+        pool.join()
+
+        # check cancellation stats
+        # give it some times to update the stats
+        time.sleep(1)
+
+        num_aborted_requests = requests.get(
+            f"http://localhost:{port}/stats").json()["num_aborted_requests"]
+        assert num_aborted_requests > 0
+
+    # check that server still runs after cancellations
+    with Pool(32) as pool:
+        # Try with 100 prompts
+        prompts = ["test prompt after canceled"] * 100
+        for result in pool.map(_query_server, prompts):
+            assert result
--- a/vllm-v0.6.2/tests/async_engine/test_async_llm_engine.py
+++ b/vllm-v0.6.2/tests/async_engine/test_async_llm_engine.py
@@ -0,0 +1,374 @@
+import asyncio
+import os
+import uuid
+from asyncio import CancelledError
+from copy import copy
+from dataclasses import dataclass
+from typing import List, Optional
+
+import pytest
+import pytest_asyncio
+import torch
+
+from vllm import SamplingParams
+from vllm.config import ParallelConfig
+from vllm.distributed import cleanup_dist_env_and_memory
+from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
+from vllm.outputs import RequestOutput as RealRequestOutput
+from vllm.sampling_params import RequestOutputKind
+
+from ..utils import wait_for_gpu_memory_to_clear
+
+
+# Minimal stand-in for an engine output record; MockEngine.step_async returns
+# instances of it. This is not the real vllm RequestOutput (that class is
+# imported in this file as RealRequestOutput).
+# NOTE(review): request_id is annotated int, but the test in this file
+# passes a string id ("2") to MockEngine.generate - annotation looks stale.
+@dataclass
+class RequestOutput:
+    request_id: int
+    finished: bool = False
+
+
+# Stand-in model config that MockEngine stores as its model_config.
+# use_async_output_proc has no type annotation, so @dataclass does not turn
+# it into a field; it is a plain class attribute that is always True.
+@dataclass
+class MockModelConfig:
+    use_async_output_proc = True
+
+
+class MockEngine:
+
+    def __init__(self):
+        self.step_calls = 0
+        self.add_request_calls = 0
+        self.abort_request_calls = 0
+        self.request_id = None
+        # Ugly, remove dependency when possible
+        self.parallel_config = ParallelConfig(1, 1, False)
+        self.model_config = MockModelConfig()
+
+    async def step_async(self, virtual_engine):
+        # PP size is 1, ignore virtual engine
+        self.step_calls += 1
+        return [RequestOutput(
+            request_id=self.request_id)] if self.request_id else []
+
+    async def process_model_inputs_async(self, *args, **kwargs):
+        pass
+
+    async def stop_remote_worker_execution_loop_async(self):
+        pass
+
+    def generate(self, request_id):
+        self.request_id = request_id
+
+    def stop_generating(self):
+        self.request_id = None
+
+    def add_request(self, **kwargs):
+        del kwargs  # Unused
+        self.add_request_calls += 1
+        print(f'Request calls: {self.add_request_calls}')
+
+    async def add_request_async(self, **kwargs):
+        self.add_request_calls += 1
+        return
+
+    def abort_request(self, request_id):
+        del request_id  # Unused
+        self.abort_request_calls += 1
+
+    def has_unfinished_requests(self):
+        return self.request_id is not None
+
+    def has_unfinished_requests_for_virtual_engine(self, virtual_engine):
+        return self.request_id is not None
+
+
+# AsyncLLMEngine with its _engine_class attribute pointed at MockEngine.
+# NOTE(review): presumably AsyncLLMEngine instantiates _engine_class as
+# self.engine (the test below reads engine.engine.step_calls) - confirm.
+class MockAsyncLLMEngine(AsyncLLMEngine):
+    _engine_class = MockEngine
+
+
+# Drives the AsyncLLMEngine background loop against MockEngine and checks,
+# via the mock's call counters, when add_request/step are invoked.
+# The asserts depend on the exact sleeps below, so keep the statement order.
+@pytest.mark.asyncio
+async def test_new_requests_event():
+    params = SamplingParams()
+
+    engine = MockAsyncLLMEngine()
+    engine.start_background_loop()
+    await asyncio.sleep(0.01)
+    # No request has been added yet: the engine must not have been stepped.
+    assert engine.engine.step_calls == 0
+
+    await engine.add_request("1", "", params)
+    await asyncio.sleep(0.01)
+    # One request added -> exactly one add and one step. The mock reports
+    # no unfinished requests here because generate() was never called.
+    assert engine.engine.add_request_calls == 1
+    assert engine.engine.step_calls == 1
+
+    await engine.add_request("2", "", params)
+    # Make the mock report an unfinished request; the asserts below expect
+    # stepping to continue while it is active.
+    engine.engine.generate("2")
+    await asyncio.sleep(0)
+    await asyncio.sleep(0)
+    await asyncio.sleep(0)
+    assert engine.engine.add_request_calls == 2
+    assert engine.engine.step_calls >= 2
+    await asyncio.sleep(0.001)
+    assert engine.engine.step_calls >= 3
+    # Clear the active request; step_calls must stop growing afterwards.
+    engine.engine.stop_generating()
+    await asyncio.sleep(0.001)
+    old_step_calls = engine.engine.step_calls
+    await asyncio.sleep(0.001)
+    assert engine.engine.step_calls == old_step_calls
+
+    # A new request after idling triggers exactly one more step, and no
+    # further steps happen while nothing else arrives.
+    await engine.add_request("3", "", params)
+    await asyncio.sleep(0.01)
+    assert engine.engine.add_request_calls == 3
+    assert engine.engine.step_calls == old_step_calls + 1
+    await asyncio.sleep(0.01)
+    assert engine.engine.add_request_calls == 3
+    assert engine.engine.step_calls == old_step_calls + 1
+
+    # NOTE(review): these getters are called without await, while
+    # get_scheduler_config() is awaited elsewhere in this file. If they are
+    # coroutines as well, each assert only checks that a coroutine object
+    # is not None - confirm.
+    engine = MockAsyncLLMEngine()
+    assert engine.get_model_config() is not None
+    assert engine.get_tokenizer() is not None
+    assert engine.get_decoding_config() is not None
+
+
+def start_engine():
+    wait_for_gpu_memory_to_clear(
+        devices=list(range(torch.cuda.device_count())),
+        threshold_bytes=2 * 2**30,
+        timeout_s=60,
+    )
+
+    num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1"))
+    print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")
+
+    return AsyncLLMEngine.from_engine_args(
+        AsyncEngineArgs(model="facebook/opt-125m",
+                        enforce_eager=True,
+                        num_scheduler_steps=num_scheduler_steps))
+
+
+# Return a fresh random UUID4 as a string; the tests in this file pass it as
+# the request_id of each generate() call.
+def uid() -> str:
+    return str(uuid.uuid4())
+
+
+# Module-scoped fixture: builds the real engine by running start_engine() in
+# the event loop's default executor (executor=None), yields it to the tests,
+# and tears it down afterwards.
+@pytest_asyncio.fixture(scope="module")
+async def async_engine():
+    engine = await asyncio.get_event_loop().run_in_executor(executor=None,
+                                                            func=start_engine)
+    try:
+        yield engine
+    finally:
+        # Teardown order: stop the background loop, drop this reference,
+        # pause briefly, then run the distributed/memory cleanup helper.
+        engine.shutdown_background_loop()
+        del engine
+        await asyncio.sleep(0.1)
+        cleanup_dist_env_and_memory()
+
+
+# Always returns False.
+# NOTE(review): presumably overrides a same-named fixture consulted by a
+# cleanup hook in conftest (not visible here) - confirm.
+@pytest.fixture()
+def should_do_global_cleanup_after_test(request) -> bool:
+    # So we can share the async engine fixture between these tests
+    return False
+
+
+# Runs the same greedy prompt twice concurrently and checks that both final
+# outputs match and that the number of streamed outputs is as expected.
+@pytest.mark.asyncio(scope="module")
+@pytest.mark.parametrize("stop", [None, ["a stop string"]])
+async def test_asyncio_run(async_engine, stop):
+
+    scheduler_config = await async_engine.get_scheduler_config()
+    num_scheduler_steps = scheduler_config.num_scheduler_steps
+
+    # Consume one generation stream and return
+    # (last output seen, number of outputs seen).
+    async def run(prompt: str):
+        sampling_params = SamplingParams(
+            temperature=0,
+            max_tokens=32,
+            min_tokens=32,
+            stop=stop,
+        )
+
+        output_count = 0
+        final_output = None
+        async for output in async_engine.generate(prompt,
+                                                  sampling_params,
+                                                  request_id=uid()):
+            output_count += 1
+            final_output = output
+        return final_output, output_count
+
+    results = await asyncio.gather(
+        run("test0"),
+        run("test0"),
+    )
+    assert len(results) == 2
+    first, second = results
+
+    # remove nondeterministic fields for comparison
+    first[0].metrics = None
+    second[0].metrics = None
+    first[0].request_id = None
+    second[0].request_id = None
+
+    # first/second are (final_output, output_count) tuples, so this
+    # compares the final outputs and the output counts in one go.
+    assert str(first) == str(second)
+
+    output_count = results[0][1]
+    # max_tokens == min_tokens == 32: with single-step scheduling one
+    # output per token is expected; with multi-step, fewer than 32.
+    if num_scheduler_steps == 1:
+        assert output_count == 32
+    else:
+        assert 1 < output_count < 32
+
+
+@pytest.mark.asyncio(scope="module")
+@pytest.mark.parametrize("stop", [None, ["a stop string"]])
+async def test_output_kinds(async_engine, stop):
+    """Test that output_kind works as expected and that
+    results are equivalent across different kinds."""
+
+    scheduler_config = await async_engine.get_scheduler_config()
+    num_scheduler_steps = scheduler_config.num_scheduler_steps
+
+    # Shared template; each helper below works on its own shallow copy so
+    # that setting output_kind does not leak between concurrent requests.
+    sampling_params = SamplingParams(
+        temperature=0,
+        max_tokens=32,
+        min_tokens=32,
+        stop=stop,
+    )
+
+    # Stream one request with the given output kind and return
+    # (prompt token ids, output token ids, output text, outputs seen),
+    # all taken from the last output.
+    async def run(prompt: str, kind: RequestOutputKind):
+        params = copy(sampling_params)
+        params.output_kind = kind
+
+        output_count = 0
+        final_output = None
+        async for output in async_engine.generate(prompt,
+                                                  params,
+                                                  request_id=uid()):
+            output_count += 1
+            final_output = output
+
+        assert final_output is not None
+        assert final_output.finished
+
+        return (final_output.prompt_token_ids,
+                final_output.outputs[0].token_ids,
+                final_output.outputs[0].text, output_count)
+
+    # Same as run(), but for DELTA outputs: the pieces are accumulated
+    # here so the result tuple is comparable with the other kinds.
+    async def run_deltas(prompt: str):
+        params = copy(sampling_params)
+        params.output_kind = RequestOutputKind.DELTA
+
+        prompt_tokens = None
+        output_tokens: List[int] = []
+        output_text = ""
+        output_count = 0
+        final_output = None
+        async for output in async_engine.generate(prompt,
+                                                  params,
+                                                  request_id=uid()):
+            token_ids = output.outputs[0].token_ids
+            text = output.outputs[0].text
+            final_output = output
+
+            # Ensure we get prompt ids iff we haven't yet received output tokens
+            if output_tokens:
+                # Each later delta carries between 1 and
+                # num_scheduler_steps new tokens and no prompt ids.
+                assert 1 <= len(token_ids) <= num_scheduler_steps
+                assert stop or text
+                assert not output.prompt_token_ids
+            else:
+                assert output.prompt_token_ids
+                prompt_tokens = output.prompt_token_ids
+
+            output_tokens.extend(token_ids)
+            output_text += text
+
+            output_count += 1
+
+        assert final_output is not None
+        assert final_output.finished
+
+        return prompt_tokens, output_tokens, output_text, output_count
+
+    results = await asyncio.gather(
+        run("common input prompt", RequestOutputKind.CUMULATIVE),
+        run("common input prompt", RequestOutputKind.FINAL_ONLY),
+        run_deltas("common input prompt"))
+
+    # Make sure outputs are the same
+    prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results)
+    assert len(prompt_set) == 1
+
+    text_set = set(text for _, _, text, _ in results)
+    assert len(text_set) == 1
+
+    tokens_set = set(tuple(ids) for _, ids, _, _ in results)
+    assert len(tokens_set) == 1
+
+    cumulative, final, deltas = results
+
+    # output message counts
+    # (index 3 of each result tuple is the number of outputs streamed)
+    assert cumulative[3] == deltas[3]
+
+    if num_scheduler_steps == 1:
+        assert cumulative[3] == 32
+    else:
+        assert 1 < cumulative[3] < 32
+
+    # FINAL_ONLY must deliver exactly one output.
+    assert final[3] == 1
+
+
+# Aborts a running request after stop_at streamed outputs and expects the
+# generate() stream to end with CancelledError without further outputs.
+@pytest.mark.asyncio(scope="module")
+@pytest.mark.parametrize("stop", [None, ["a stop string"]])
+async def test_cancellation(async_engine, stop):
+    scheduler_config = await async_engine.get_scheduler_config()
+    num_scheduler_steps = scheduler_config.num_scheduler_steps
+
+    sampling_params = SamplingParams(
+        temperature=0,
+        min_tokens=13,
+        max_tokens=13,
+        stop=stop,
+    )
+
+    # Abort after 5 outputs with single-step scheduling, otherwise after
+    # the first output.
+    stop_at = 5 if num_scheduler_steps == 1 else 1
+
+    request_id = uid()
+
+    i = 0
+    with pytest.raises(CancelledError):
+        async for output in async_engine.generate("test2",
+                                                  sampling_params,
+                                                  request_id=request_id):
+            assert not output.finished
+            i += 1
+            if i == stop_at:
+                await async_engine.abort(request_id)
+
+    # No output may have been delivered after the abort.
+    assert i == stop_at
+
+
+# Checks that outputs are not lost when the consumer pauses after the first
+# one: all 10 outputs must still arrive, and only the last one is finished.
+@pytest.mark.asyncio(scope="module")
+@pytest.mark.parametrize("stop", [None, ["a stop string"]])
+async def test_delayed_generator(async_engine, stop):
+    scheduler_config = await async_engine.get_scheduler_config()
+
+    if scheduler_config.num_scheduler_steps != 1:
+        pytest.skip("no need to test this one with multistep")
+
+    sampling_params = SamplingParams(
+        temperature=0,
+        min_tokens=10,
+        max_tokens=10,
+        stop=stop,
+    )
+
+    stream = async_engine.generate("test3", sampling_params, request_id=uid())
+    i = 0
+    final_output: Optional[RealRequestOutput] = None
+    async for output in stream:
+        final_output = output
+        if i == 0:
+            # wait for generation to complete before consuming
+            # the remaining messages
+            await asyncio.sleep(1)
+        # Outputs 0..8 must be unfinished; the tenth one (i == 9) is
+        # checked after the loop.
+        if i < 9:
+            assert not output.finished
+        i += 1
+
+    assert i == 10
+    assert final_output is not None
+    assert len(final_output.outputs[0].token_ids) == 10
+    assert final_output.finished
--- a/vllm-v0.6.2/tests/async_engine/test_openapi_server.py
+++ b/vllm-v0.6.2/tests/async_engine/test_openapi_server.py
@@ -0,0 +1,106 @@
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+
+from ..utils import VLLM_PATH, RemoteOpenAIServer
+
+# any model with a chat template should work here
+# (the server fixture below passes the ChatML template explicitly via
+# --chat-template)
+MODEL_NAME = "facebook/opt-125m"
+chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
+# Fail at import (test collection) time if the template file is missing.
+assert chatml_jinja_path.exists()
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "float16",
+        "--max-model-len",
+        "2048",
+        "--enforce-eager",
+        "--chat-template",
+        str(chatml_jinja_path),
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+# Per-test async client obtained from the module-scoped server fixture; the
+# async context manager is exited after the test has finished.
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_check_models(client: openai.AsyncOpenAI):
+    models = await client.models.list()
+    models = models.data
+    served_model = models[0]
+    assert served_model.id == MODEL_NAME
+    assert all(model.root == MODEL_NAME for model in models)
+
+
+@pytest.mark.asyncio
+async def test_single_completion(client: openai.AsyncOpenAI):
+    completion = await client.completions.create(model=MODEL_NAME,
+                                                 prompt="Hello, my name is",
+                                                 max_tokens=5,
+                                                 temperature=0.0)
+
+    assert completion.id is not None
+    assert len(completion.choices) == 1
+    assert len(completion.choices[0].text) >= 5
+    assert completion.choices[0].finish_reason == "length"
+    assert completion.usage == openai.types.CompletionUsage(
+        completion_tokens=5, prompt_tokens=6, total_tokens=11)
+
+    # test using token IDs
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    assert len(completion.choices[0].text) >= 5
+
+
+@pytest.mark.asyncio
+async def test_single_chat_session(client: openai.AsyncOpenAI):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role": "user",
+        "content": "what is 1+1?"
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
+                                                           messages=messages,
+                                                           max_tokens=10,
+                                                           logprobs=True,
+                                                           top_logprobs=5)
+    assert chat_completion.id is not None
+    assert len(chat_completion.choices) == 1
+
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
+    assert chat_completion.usage == openai.types.CompletionUsage(
+        completion_tokens=10, prompt_tokens=55, total_tokens=65)
+
+    message = choice.message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
--- a/vllm-v0.6.2/tests/async_engine/test_request_tracker.py
+++ b/vllm-v0.6.2/tests/async_engine/test_request_tracker.py
@@ -0,0 +1,68 @@
+import pytest
+
+from vllm.engine.async_llm_engine import RequestTracker
+from vllm.outputs import RequestOutput
+
+
+# Walks a RequestTracker through add / abort / finish sequences and checks
+# the new_requests_event flag, the (new, aborted) batches handed out, and
+# the finished state of each returned stream. Steps build on each other.
+@pytest.mark.asyncio
+async def test_request_tracker():
+    tracker = RequestTracker()
+    # Adding a request sets the event; fetching the batch clears it.
+    stream_1 = tracker.add_request("1")
+    assert tracker.new_requests_event.is_set()
+    await tracker.wait_for_new_requests()
+    new, aborted = tracker.get_new_and_aborted_requests()
+    assert not tracker.new_requests_event.is_set()
+    assert len(new) == 1
+    assert new[0]["request_id"] == "1"
+    assert not aborted
+    assert not stream_1.finished
+
+    # Two requests added back to back come out in insertion order.
+    stream_2 = tracker.add_request("2")
+    stream_3 = tracker.add_request("3")
+    assert tracker.new_requests_event.is_set()
+    await tracker.wait_for_new_requests()
+    new, aborted = tracker.get_new_and_aborted_requests()
+    assert not tracker.new_requests_event.is_set()
+    assert len(new) == 2
+    assert new[0]["request_id"] == "2"
+    assert new[1]["request_id"] == "3"
+    assert not aborted
+    assert not stream_2.finished
+    assert not stream_3.finished
+
+    # request_ids must be unique
+    with pytest.raises(KeyError):
+        tracker.add_request("1")
+    # The rejected duplicate must not have set the event.
+    assert not tracker.new_requests_event.is_set()
+
+    # Aborting an already-fetched request reports it as aborted and
+    # finishes its stream.
+    tracker.abort_request("1")
+    new, aborted = tracker.get_new_and_aborted_requests()
+    assert len(aborted) == 1
+    assert "1" in aborted
+    assert not new
+    assert stream_1.finished
+
+    stream_4 = tracker.add_request("4")
+    tracker.abort_request("4")
+    assert tracker.new_requests_event.is_set()
+    await tracker.wait_for_new_requests()
+    new, aborted = tracker.get_new_and_aborted_requests()
+    # aborted new requests will cancel each other out -
+    # there's no need for them to propagate into the
+    # engine
+    assert not aborted
+    assert not new
+    assert stream_4.finished
+
+    # Feeding a finished output for request "2" finishes stream_2 without
+    # reporting it as aborted; the newly added "5" is still handed out.
+    stream_5 = tracker.add_request("5")
+    assert tracker.new_requests_event.is_set()
+    tracker.process_request_output(
+        RequestOutput("2", "output", [], [], [], finished=True))
+    await tracker.wait_for_new_requests()
+    new, aborted = tracker.get_new_and_aborted_requests()
+    assert not tracker.new_requests_event.is_set()
+    assert not aborted
+    assert len(new) == 1
+    assert new[0]["request_id"] == "5"
+    assert stream_2.finished
+    assert not stream_5.finished