add qwen3

Chranos
2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions

View File

@@ -0,0 +1,51 @@
"""vllm.entrypoints.api_server with some extra logging for testing."""
from typing import Any, Dict, Iterable
import uvicorn
from fastapi.responses import JSONResponse, Response
import vllm.entrypoints.api_server
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.utils import FlexibleArgumentParser
app = vllm.entrypoints.api_server.app
class AsyncLLMEngineWithStats(AsyncLLMEngine):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._num_aborts = 0
async def _engine_abort(self, request_ids: Iterable[str]):
ids = list(request_ids)
self._num_aborts += len(ids)
await super()._engine_abort(ids)
def testing_stats(self) -> Dict[str, Any]:
return {"num_aborted_requests": self._num_aborts}
@app.get("/stats")
def stats() -> Response:
"""Get the statistics of the engine."""
return JSONResponse(engine.testing_stats())
if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
vllm.entrypoints.api_server.engine = engine
uvicorn.run(
app,
host=args.host,
port=args.port,
log_level="debug",
timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE)
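
For reference, a minimal sketch of how the /stats endpoint above can be exercised once this server is running; the host, port, and prompt values are illustrative assumptions:

import requests

# Issue one generation so the engine has handled at least one request.
requests.post("http://localhost:8000/generate",
              json={"prompt": "hello", "max_tokens": 5})
# /stats returns the abort counter tracked by AsyncLLMEngineWithStats.
stats = requests.get("http://localhost:8000/stats").json()
print(stats["num_aborted_requests"])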

View File

@@ -0,0 +1,109 @@
import subprocess
import sys
import time
from multiprocessing import Pool
from pathlib import Path
import pytest
import requests
from vllm.utils import get_open_port
port = get_open_port()
def _query_server(prompt: str, max_tokens: int = 5) -> dict:
response = requests.post(f"http://localhost:{port}/generate",
json={
"prompt": prompt,
"max_tokens": max_tokens,
"temperature": 0,
"ignore_eos": True
})
response.raise_for_status()
return response.json()
def _query_server_long(prompt: str) -> dict:
return _query_server(prompt, max_tokens=500)
@pytest.fixture
def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
script_path = Path(__file__).parent.joinpath(
"api_server_async_engine.py").absolute()
commands = [
sys.executable, "-u",
str(script_path), "--model", "facebook/opt-125m", "--host",
"127.0.0.1", "--port", f"{port}", "--tokenizer-pool-size",
str(tokenizer_pool_size)
]
if worker_use_ray:
commands.append("--worker-use-ray")
uvicorn_process = subprocess.Popen(commands)
yield
uvicorn_process.terminate()
time.sleep(10)
@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
@pytest.mark.parametrize("worker_use_ray", [False, True])
def test_api_server(api_server, tokenizer_pool_size: int,
worker_use_ray: bool):
"""
Run the API server and test it.
We run both the server and requests in separate processes.
We test that the server can handle incoming requests, including
multiple requests at the same time, and that it can handle requests
being cancelled without crashing.
"""
with Pool(32) as pool:
# Wait until the server is ready
prompts = ["warm up"] * 1
result = None
while not result:
try:
for r in pool.map(_query_server, prompts):
result = r
break
except requests.exceptions.ConnectionError:
time.sleep(1)
# Actual tests start here
# Try with 1 prompt
for result in pool.map(_query_server, prompts):
assert result
num_aborted_requests = requests.get(
f"http://localhost:{port}/stats").json()["num_aborted_requests"]
assert num_aborted_requests == 0
# Try with 100 prompts
prompts = ["test prompt"] * 100
for result in pool.map(_query_server, prompts):
assert result
with Pool(32) as pool:
# Cancel requests
prompts = ["canceled requests"] * 100
pool.map_async(_query_server_long, prompts)
time.sleep(0.01)
pool.terminate()
pool.join()
# check cancellation stats
# give it some time to update the stats
time.sleep(1)
num_aborted_requests = requests.get(
f"http://localhost:{port}/stats").json()["num_aborted_requests"]
assert num_aborted_requests > 0
# check that server still runs after cancellations
with Pool(32) as pool:
# Try with 100 prompts
prompts = ["test prompt after canceled"] * 100
for result in pool.map(_query_server, prompts):
assert result

View File

@@ -0,0 +1,374 @@
import asyncio
import os
import uuid
from asyncio import CancelledError
from copy import copy
from dataclasses import dataclass
from typing import List, Optional
import pytest
import pytest_asyncio
import torch
from vllm import SamplingParams
from vllm.config import ParallelConfig
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
from vllm.outputs import RequestOutput as RealRequestOutput
from vllm.sampling_params import RequestOutputKind
from ..utils import wait_for_gpu_memory_to_clear
@dataclass
class RequestOutput:
request_id: int
finished: bool = False
@dataclass
class MockModelConfig:
use_async_output_proc = True
class MockEngine:
def __init__(self):
self.step_calls = 0
self.add_request_calls = 0
self.abort_request_calls = 0
self.request_id = None
# Ugly, remove dependency when possible
self.parallel_config = ParallelConfig(1, 1, False)
self.model_config = MockModelConfig()
async def step_async(self, virtual_engine):
# PP size is 1, ignore virtual engine
self.step_calls += 1
return [RequestOutput(
request_id=self.request_id)] if self.request_id else []
async def process_model_inputs_async(self, *args, **kwargs):
pass
async def stop_remote_worker_execution_loop_async(self):
pass
def generate(self, request_id):
self.request_id = request_id
def stop_generating(self):
self.request_id = None
def add_request(self, **kwargs):
del kwargs # Unused
self.add_request_calls += 1
print(f'Request calls: {self.add_request_calls}')
async def add_request_async(self, **kwargs):
self.add_request_calls += 1
return
def abort_request(self, request_id):
del request_id # Unused
self.abort_request_calls += 1
def has_unfinished_requests(self):
return self.request_id is not None
def has_unfinished_requests_for_virtual_engine(self, virtual_engine):
return self.request_id is not None
class MockAsyncLLMEngine(AsyncLLMEngine):
_engine_class = MockEngine
@pytest.mark.asyncio
async def test_new_requests_event():
params = SamplingParams()
engine = MockAsyncLLMEngine()
engine.start_background_loop()
await asyncio.sleep(0.01)
assert engine.engine.step_calls == 0
await engine.add_request("1", "", params)
await asyncio.sleep(0.01)
assert engine.engine.add_request_calls == 1
assert engine.engine.step_calls == 1
await engine.add_request("2", "", params)
engine.engine.generate("2")
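# Yield to the event loop a few times so the background loop can pick
# up the new request and start stepping.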
await asyncio.sleep(0)
await asyncio.sleep(0)
await asyncio.sleep(0)
assert engine.engine.add_request_calls == 2
assert engine.engine.step_calls >= 2
await asyncio.sleep(0.001)
assert engine.engine.step_calls >= 3
engine.engine.stop_generating()
await asyncio.sleep(0.001)
old_step_calls = engine.engine.step_calls
await asyncio.sleep(0.001)
assert engine.engine.step_calls == old_step_calls
await engine.add_request("3", "", params)
await asyncio.sleep(0.01)
assert engine.engine.add_request_calls == 3
assert engine.engine.step_calls == old_step_calls + 1
await asyncio.sleep(0.01)
assert engine.engine.add_request_calls == 3
assert engine.engine.step_calls == old_step_calls + 1
engine = MockAsyncLLMEngine()
assert engine.get_model_config() is not None
assert engine.get_tokenizer() is not None
assert engine.get_decoding_config() is not None
def start_engine():
wait_for_gpu_memory_to_clear(
devices=list(range(torch.cuda.device_count())),
threshold_bytes=2 * 2**30,
timeout_s=60,
)
num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1"))
print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")
return AsyncLLMEngine.from_engine_args(
AsyncEngineArgs(model="facebook/opt-125m",
enforce_eager=True,
num_scheduler_steps=num_scheduler_steps))
def uid() -> str:
return str(uuid.uuid4())
@pytest_asyncio.fixture(scope="module")
async def async_engine():
engine = await asyncio.get_event_loop().run_in_executor(executor=None,
func=start_engine)
try:
yield engine
finally:
engine.shutdown_background_loop()
del engine
await asyncio.sleep(0.1)
cleanup_dist_env_and_memory()
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
# So we can share the async engine fixture between these tests
return False
@pytest.mark.asyncio(scope="module")
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
async def test_asyncio_run(async_engine, stop):
scheduler_config = await async_engine.get_scheduler_config()
num_scheduler_steps = scheduler_config.num_scheduler_steps
async def run(prompt: str):
sampling_params = SamplingParams(
temperature=0,
max_tokens=32,
min_tokens=32,
stop=stop,
)
output_count = 0
final_output = None
async for output in async_engine.generate(prompt,
sampling_params,
request_id=uid()):
output_count += 1
final_output = output
return final_output, output_count
results = await asyncio.gather(
run("test0"),
run("test0"),
)
assert len(results) == 2
first, second = results
# remove nondeterministic fields for comparison
first[0].metrics = None
second[0].metrics = None
first[0].request_id = None
second[0].request_id = None
assert str(first) == str(second)
output_count = results[0][1]
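# With multi-step scheduling, each streamed message can carry several
# tokens, so fewer than max_tokens messages are produced.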
if num_scheduler_steps == 1:
assert output_count == 32
else:
assert 1 < output_count < 32
@pytest.mark.asyncio(scope="module")
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
async def test_output_kinds(async_engine, stop):
"""Test that output_kind works as expected and that
results are equivalent across different kinds."""
scheduler_config = await async_engine.get_scheduler_config()
num_scheduler_steps = scheduler_config.num_scheduler_steps
sampling_params = SamplingParams(
temperature=0,
max_tokens=32,
min_tokens=32,
stop=stop,
)
async def run(prompt: str, kind: RequestOutputKind):
params = copy(sampling_params)
params.output_kind = kind
output_count = 0
final_output = None
async for output in async_engine.generate(prompt,
params,
request_id=uid()):
output_count += 1
final_output = output
assert final_output is not None
assert final_output.finished
return (final_output.prompt_token_ids,
final_output.outputs[0].token_ids,
final_output.outputs[0].text, output_count)
async def run_deltas(prompt: str):
params = copy(sampling_params)
params.output_kind = RequestOutputKind.DELTA
prompt_tokens = None
output_tokens: List[int] = []
output_text = ""
output_count = 0
final_output = None
async for output in async_engine.generate(prompt,
params,
request_id=uid()):
token_ids = output.outputs[0].token_ids
text = output.outputs[0].text
final_output = output
# Ensure we get prompt ids iff we haven't yet received output tokens
if output_tokens:
assert 1 <= len(token_ids) <= num_scheduler_steps
assert stop or text
assert not output.prompt_token_ids
else:
assert output.prompt_token_ids
prompt_tokens = output.prompt_token_ids
output_tokens.extend(token_ids)
output_text += text
output_count += 1
assert final_output is not None
assert final_output.finished
return prompt_tokens, output_tokens, output_text, output_count
results = await asyncio.gather(
run("common input prompt", RequestOutputKind.CUMULATIVE),
run("common input prompt", RequestOutputKind.FINAL_ONLY),
run_deltas("common input prompt"))
# Make sure outputs are the same
prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results)
assert len(prompt_set) == 1
text_set = set(text for _, _, text, _ in results)
assert len(text_set) == 1
tokens_set = set(tuple(ids) for _, ids, _, _ in results)
assert len(tokens_set) == 1
cumulative, final, deltas = results
# output message counts
assert cumulative[3] == deltas[3]
if num_scheduler_steps == 1:
assert cumulative[3] == 32
else:
assert 1 < cumulative[3] < 32
assert final[3] == 1
@pytest.mark.asyncio(scope="module")
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
async def test_cancellation(async_engine, stop):
scheduler_config = await async_engine.get_scheduler_config()
num_scheduler_steps = scheduler_config.num_scheduler_steps
sampling_params = SamplingParams(
temperature=0,
min_tokens=13,
max_tokens=13,
stop=stop,
)
stop_at = 5 if num_scheduler_steps == 1 else 1
request_id = uid()
i = 0
with pytest.raises(CancelledError):
async for output in async_engine.generate("test2",
sampling_params,
request_id=request_id):
assert not output.finished
i += 1
if i == stop_at:
await async_engine.abort(request_id)
assert i == stop_at
@pytest.mark.asyncio(scope="module")
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
async def test_delayed_generator(async_engine, stop):
scheduler_config = await async_engine.get_scheduler_config()
if scheduler_config.num_scheduler_steps != 1:
pytest.skip("no need to test this one with multistep")
sampling_params = SamplingParams(
temperature=0,
min_tokens=10,
max_tokens=10,
stop=stop,
)
stream = async_engine.generate("test3", sampling_params, request_id=uid())
i = 0
final_output: Optional[RealRequestOutput] = None
async for output in stream:
final_output = output
if i == 0:
# wait for generation to complete before consuming
# the remaining messages
await asyncio.sleep(1)
if i < 9:
assert not output.finished
i += 1
assert i == 10
assert final_output is not None
assert len(final_output.outputs[0].token_ids) == 10
assert final_output.finished

View File

@@ -0,0 +1,106 @@
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ..utils import VLLM_PATH, RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()
@pytest.fixture(scope="module")
def server():
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--max-model-len",
"2048",
"--enforce-eager",
"--chat-template",
str(chatml_jinja_path),
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
async def test_check_models(client: openai.AsyncOpenAI):
models = await client.models.list()
models = models.data
served_model = models[0]
assert served_model.id == MODEL_NAME
assert all(model.root == MODEL_NAME for model in models)
@pytest.mark.asyncio
async def test_single_completion(client: openai.AsyncOpenAI):
completion = await client.completions.create(model=MODEL_NAME,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
assert completion.id is not None
assert len(completion.choices) == 1
assert len(completion.choices[0].text) >= 5
assert completion.choices[0].finish_reason == "length"
assert completion.usage == openai.types.CompletionUsage(
completion_tokens=5, prompt_tokens=6, total_tokens=11)
# test using token IDs
completion = await client.completions.create(
model=MODEL_NAME,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
assert len(completion.choices[0].text) >= 5
@pytest.mark.asyncio
async def test_single_chat_session(client: openai.AsyncOpenAI):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
# test single completion
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
messages=messages,
max_tokens=10,
logprobs=True,
top_logprobs=5)
assert chat_completion.id is not None
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=55, total_tokens=65)
message = choice.message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0

View File

@@ -0,0 +1,68 @@
import pytest
from vllm.engine.async_llm_engine import RequestTracker
from vllm.outputs import RequestOutput
@pytest.mark.asyncio
async def test_request_tracker():
tracker = RequestTracker()
stream_1 = tracker.add_request("1")
assert tracker.new_requests_event.is_set()
await tracker.wait_for_new_requests()
new, aborted = tracker.get_new_and_aborted_requests()
assert not tracker.new_requests_event.is_set()
assert len(new) == 1
assert new[0]["request_id"] == "1"
assert not aborted
assert not stream_1.finished
stream_2 = tracker.add_request("2")
stream_3 = tracker.add_request("3")
assert tracker.new_requests_event.is_set()
await tracker.wait_for_new_requests()
new, aborted = tracker.get_new_and_aborted_requests()
assert not tracker.new_requests_event.is_set()
assert len(new) == 2
assert new[0]["request_id"] == "2"
assert new[1]["request_id"] == "3"
assert not aborted
assert not stream_2.finished
assert not stream_3.finished
# request_ids must be unique
with pytest.raises(KeyError):
tracker.add_request("1")
assert not tracker.new_requests_event.is_set()
tracker.abort_request("1")
new, aborted = tracker.get_new_and_aborted_requests()
assert len(aborted) == 1
assert "1" in aborted
assert not new
assert stream_1.finished
stream_4 = tracker.add_request("4")
tracker.abort_request("4")
assert tracker.new_requests_event.is_set()
await tracker.wait_for_new_requests()
new, aborted = tracker.get_new_and_aborted_requests()
# aborted new requests will cancel each other out -
# there's no need for them to propagate into the
# engine
assert not aborted
assert not new
assert stream_4.finished
stream_5 = tracker.add_request("5")
assert tracker.new_requests_event.is_set()
tracker.process_request_output(
RequestOutput("2", "output", [], [], [], finished=True))
await tracker.wait_for_new_requests()
new, aborted = tracker.get_new_and_aborted_requests()
assert not tracker.new_requests_event.is_set()
assert not aborted
assert len(new) == 1
assert new[0]["request_id"] == "5"
assert stream_2.finished
assert not stream_5.finished

View File

@@ -0,0 +1,198 @@
"""Compare the short outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
"""
import os
import pickle
import re
import weakref
from unittest.mock import patch
import pytest
from vllm import LLM
from vllm.platforms import current_platform
from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
from ..models.utils import check_outputs_equal
from ..utils import multi_gpu_test
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-3.2-1B-Instruct",
]
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted"""
llm = LLM("facebook/opt-125m")
weak_llm = weakref.ref(llm)
del llm
# If there's any circular reference to vllm, this fails
# because llm instance is not GC'ed.
assert weak_llm() is None
'''
=============================
Modified by vllm_mlu
=============================
@brief(backend): MLU devices only support the MLU_FLASH_ATTN backend
'''
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["MLU_FLASH_ATTN"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False, True])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
backend: str,
dtype: str,
max_tokens: int,
enforce_eager: bool,
) -> None:
if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.")
os.environ["VLLM_ATTENTION_BACKEND"] = backend
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner(model,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
'''
=============================
Modified by vllm_mlu
=============================
@brief(multi_gpu_test): torch_mlu does not support multi-process tests without 'spawn'
@brief(backend): MLU devices only support the MLU_FLASH_ATTN backend
'''
# @multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
"model, distributed_executor_backend, attention_backend, "
"test_suite", [
("facebook/opt-125m", "ray", "", "L4"),
# ("facebook/opt-125m", "mp", "", "L4"),
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
# ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
("facebook/opt-125m", "ray", "", "A100"),
("facebook/opt-125m", "mp", "", "A100"),
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
])
def test_models_distributed(
hf_runner,
vllm_runner,
example_prompts,
model: str,
distributed_executor_backend: str,
attention_backend: str,
test_suite: str,
) -> None:
# use MLU_FLASH_ATTN for MLU devices
attention_backend = "MLU_FLASH_ATTN"
if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}")
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# test ray adag
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
if attention_backend:
os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
dtype = "half"
max_tokens = 5
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(model,
dtype=dtype,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
def test_model_with_failure(vllm_runner) -> None:
    filename = None
    try:
with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
side_effect=ValueError()):
with pytest.raises(ValueError) as exc_info:
vllm_runner("facebook/opt-125m",
dtype="half",
enforce_eager=False,
gpu_memory_utilization=0.7)
matches = re.search(r"input dumped to (.+).pkl",
str(exc_info.value))
assert matches is not None
filename = f"{matches.group(1)}.pkl"
with open(filename, "rb") as filep:
inputs = pickle.load(filep)
if any(key not in inputs for key in ("arg_1", "arg_2", "arg_3")):
raise AssertionError("Missing keys in dumped inputs. Dumped keys: "
f"{list(inputs.keys())}")
assert isinstance(inputs["arg_1"],
ModelInputForGPUWithSamplingMetadata)
    finally:
        if filename is not None:
            os.remove(filename)
def test_failure_with_async_out_proc(vllm_runner) -> None:
filename = None
try:
with vllm_runner("facebook/opt-125m",
dtype="half",
enforce_eager=False,
gpu_memory_utilization=0.7) as vllm_model,\
patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
side_effect=ValueError()):
model_config = vllm_model.model.llm_engine.model_config
assert model_config.use_async_output_proc
with pytest.raises(ValueError) as exc_info:
vllm_model.generate_greedy('how to make pizza?', 250)
matches = re.search(r"input dumped to (.+).pkl",
str(exc_info.value))
assert matches is not None
filename = f"{matches.group(1)}.pkl"
finally:
# Clean up
if filename is not None:
os.remove(filename)

View File

@@ -0,0 +1,298 @@
"""Compare the outputs of HF and vLLM when using greedy sampling.
It tests chunked prefill. Chunked prefill can be enabled by
enable_chunked_prefill=True. If prefill size exceeds max_num_batched_tokens,
prefill requests are chunked.
Run `pytest tests/models/test_chunked_prefill.py`.
"""
import os
from contextlib import nullcontext
import pytest
from tests.kernels.utils import override_backend_env_variable
from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-3.2-1B-Instruct",
]
'''
=============================
Modified by vllm_mlu
=============================
@brief(backend): MLU devices only support the MLU_FLASH_ATTN backend
NOTE: chunked_prefill_token_size=1 has an accuracy issue,
so we skip this case in the MLU unit tests.
TODO(VLLM-662): fix accuracy error
'''
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
# The original case is: @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", ["MLU_FLASH_ATTN"])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
chunked_prefill_token_size: int,
enforce_eager: bool,
tensor_parallel_size: int,
attention_backend: str,
monkeypatch,
) -> None:
"""
Checks exact match decode between huggingface model and vllm runner with
chunked prefill.
"""
override_backend_env_variable(monkeypatch, attention_backend)
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
'''
=============================
Modified by vllm_mlu
=============================
NOTE: Since the KV cache is too large for small models, which would trigger
the large-tensor problem in flash attention, we specify num_gpu_blocks_override=100.
'''
with vllm_runner(
model,
dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
num_gpu_blocks_override=100,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
'''
=============================
Modified by vllm_mlu
=============================
@brief(multi_gpu_test): torch_mlu does not support multi-process tests without 'spawn'
@brief(backend): MLU devices only support the MLU_FLASH_ATTN backend
'''
# @multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", ["MLU_FLASH_ATTN"])
def test_models_distributed(
hf_runner,
vllm_runner,
example_prompts,
model: str,
distributed_executor_backend: str,
attention_backend: str,
monkeypatch,
) -> None:
if (model == "meta-llama/Llama-2-7b-hf"
and distributed_executor_backend == "ray"):
# test ray adag
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "0"
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "0"
# Set the attention backend environment variable
os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
dtype = "half"
max_tokens = 5
chunked_prefill_token_size = 16
# Add a chunked prefill config.
max_num_seqs = min(chunked_prefill_token_size, 256)
assert chunked_prefill_token_size != -1
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
gpu_memory_utilization=0.6,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize(
"kv_cache_dtype,model",
[("fp8_e4m3",
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
# Due to low-precision numerical divergence, this test is too sensitive to
# the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True])
def test_models_with_fp8_kv_cache(
vllm_runner,
example_prompts,
kv_cache_dtype: str,
model: str,
max_tokens: int,
chunked_prefill_token_size: int,
enforce_eager: bool,
tensor_parallel_size: int,
disable_async_output_proc: bool,
) -> None:
"""
Check output logprobs match between no_chunked_prefill and chunked_prefill
with fp8 kv cache. General fp8 kv-cache tests are covered in test_fp8.py,
so here we only check chunked prefill.
"""
NUM_LOG_PROBS = 8
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
with vllm_runner(
model,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
with vllm_runner(
model,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
check_logprobs_close(
outputs_0_lst=no_chunked_prefill_outputs,
outputs_1_lst=chunked_prefill_outputs,
name_0="no_chunked_prefill",
name_1="chunked_prefill",
)
'''
=============================
Modified by vllm_mlu
=============================
NOTE: chunk_size=32 under VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has an accuracy issue,
so we skip this case in the MLU unit tests.
TODO(VLLM-662): fix accuracy error
'''
@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False])
# the original case is @pytest.mark.parametrize("chunk_size", [30, 32])
@pytest.mark.parametrize("chunk_size", [30])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
def test_with_prefix_caching(
vllm_runner,
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
tensor_parallel_size: int,
) -> None:
"""
Checks exact match decode with and without prefix caching
with chunked prefill enabled.
"""
model = "meta-llama/Llama-2-7b-chat-hf"
# The common prompt has 142 tokens with Llama-2 tokenizer.
common_prompt = "You are a helpful AI assistant " * 20
unique_prompts = [
"Question", # Warmup
"Question", # Fully cached
"Another question", # Partial cached
]
full_prompts = [f"{common_prompt}\n{p}" for p in unique_prompts]
max_num_batched_tokens = max_num_seqs = chunk_size
outputs = {} # type: ignore
check_result = True
for enable in (True, False):
with vllm_runner(
model,
dtype="half",
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True,
enable_prefix_caching=enable,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
) as vllm_model:
# It should fail when prefix caching is enabled and the chunk
# size is not a multiple of the block size (16).
should_fail = chunk_size % 16 != 0 and enable
check_result &= not should_fail
outputs[enable] = []
# Send the requests one by one to ensure the cache is populated.
with pytest.raises(ValueError) if should_fail else nullcontext():
for prompt in full_prompts:
outputs[enable] += vllm_model.generate_greedy([prompt],
max_tokens)
# Check results only if we did not expect a failure.
if check_result:
check_outputs_equal(
outputs_0_lst=outputs[False],
outputs_1_lst=outputs[True],
name_0="w/o prefix caching",
name_1="with prefix caching",
)

View File

@@ -0,0 +1,6 @@
from ..utils import compare_two_settings
def test_cpu_offload():
compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
["--cpu-offload-gb", "1"])

View File

@@ -0,0 +1,186 @@
"""Compare the short outputs of HF and vLLM when using greedy sampling.
VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
pytest tests/basic_correctness/test_preemption.py`.
"""
import pytest
from prometheus_client import REGISTRY
import vllm.envs as envs
from vllm import SamplingParams
from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
ENABLE_ARTIFICIAL_PREEMPT)
from ..models.utils import check_outputs_equal
MODELS = [
"facebook/opt-125m",
]
@pytest.fixture(scope="module", autouse=True)
def check_settings():
assert ENABLE_ARTIFICIAL_PREEMPT is True, (
    "Set the env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1, e.g. "
    "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 "
    "pytest tests/basic_correctness/test_preemption.py`")
@pytest.fixture
def worker_use_ray() -> bool:
# When the SPMD worker is used, use worker_use_ray=True
# to test that the delta input optimization works with preemption.
return envs.VLLM_USE_RAY_SPMD_WORKER
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [96])
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
def test_chunked_prefill_recompute(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
chunked_prefill_token_size: int,
worker_use_ray: bool,
) -> None:
"""Ensure that chunked prefill works with preemption."""
max_num_seqs = min(chunked_prefill_token_size, 256)
enable_chunked_prefill = False
max_num_batched_tokens = None
if chunked_prefill_token_size != -1:
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
'''
=============================
Modified by vllm_mlu
=============================
NOTE: Since the KV cache is too large for small models, which would trigger
the large-tensor problem in flash attention, we specify num_gpu_blocks_override=500.
'''
with vllm_runner(
model,
dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=enable_chunked_prefill,
max_num_seqs=max_num_seqs,
worker_use_ray=worker_use_ray,
disable_log_stats=False,
num_gpu_blocks_override=500,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_preemption(
caplog_vllm,
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
worker_use_ray: bool,
) -> None:
"""By default, recompute preemption is enabled"""
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner(
model,
dtype=dtype,
disable_log_stats=False,
worker_use_ray=worker_use_ray,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
total_preemption = (
vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
assert ("is preempted by PreemptionMode.RECOMPUTE mode because there "
"is not enough KV cache space." in caplog_vllm.text)
# Ensure the count bucket of request-level histogram metrics matches
# the number of requests as a simple sanity check to ensure metrics are
# generated
preemption_metrics = None
for m in REGISTRY.collect():
if m.name == "vllm:num_preemptions":
preemption_metrics = m
assert preemption_metrics is not None
total_recorded_preemption = 0
for sample in preemption_metrics.samples:
total_recorded_preemption += sample.value
assert total_preemption == total_recorded_preemption
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_preemption_infeasible(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
worker_use_ray: bool,
) -> None:
"""Verify infeasible preemption request will be ignored."""
BLOCK_SIZE = 16
prefill_blocks = 2
decode_blocks = max_tokens // BLOCK_SIZE
with vllm_runner(
model,
dtype=dtype,
block_size=BLOCK_SIZE,
# Not enough gpu blocks to complete a single sequence.
# preemption should happen, and the sequence should be
# ignored instead of hanging forever.
num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
worker_use_ray=worker_use_ray,
) as vllm_model:
sampling_params = SamplingParams(max_tokens=max_tokens,
ignore_eos=True)
req_outputs = vllm_model.model.generate(
example_prompts,
sampling_params=sampling_params,
)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
# Verify the requests are ignored and do not hang.
for req_output in req_outputs:
outputs = req_output.outputs
assert len(outputs) == 1
assert outputs[0].finish_reason == "length"

View File

@@ -0,0 +1,59 @@
import os
import numpy as np
import pandas as pd
from vllm import LLM, SamplingParams
def test_generating_csv():
'''
Test generating the CSV output.
'''
# the contents of this test are adapted from benchmark_latency.py
csv_file = "output.csv"
if os.path.isfile(csv_file):
os.remove("output.csv")
assert not os.path.isfile(csv_file)
os.environ['VLLM_LATENCY_DEBUG'] = "1"
model_path = "/data/vllm/sq_per_tensor_per_channel/Llama-2-7b-hf"
tp = 1
batch_size = 4
input_len = 128
output_len = 5
quantization = "smoothquant"
llm = LLM(model=model_path,
tokenizer=model_path,
quantization=quantization,
tensor_parallel_size=tp,
trust_remote_code=True,
enforce_eager=True)
sampling_params = SamplingParams(
n=1,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=output_len,
)
dummy_prompt_token_ids = np.random.randint(10000,
size=(batch_size,
input_len))
dummy_prompt_token_ids = dummy_prompt_token_ids.tolist()
llm.generate(prompt_token_ids=dummy_prompt_token_ids,
sampling_params=sampling_params,
use_tqdm=False)
llm.get_metrics(0,  # args.num_iters_warmup
                False,  # args.only_average
                input_len,  # args.input_len
                output_len,  # args.output_len
                tp,  # args.tensor_parallel_size
                quantization,  # args.quantization
                llm.dump_info)
assert os.path.isfile(csv_file)
df = pd.read_csv(csv_file)
assert df['batch size'].item() == batch_size
assert df['model'].item() == model_path
assert df['input len'].item() == input_len
assert df['output len'].item() == output_len
assert df['tp'].item() == tp
assert df['weight dtype'].item() == "SmoothQuant-int8"
os.remove(csv_file)

View File

@@ -0,0 +1,33 @@
from copy import deepcopy
from typing import Callable
import torch
class TestBackend:
"""
This class provides a simple Inductor backend that can be used for testing.
It takes a list of custom passes and runs them after Inductor's passes.
It also saves the graph before and after the custom passes for inspection.
"""
def __init__(self, *args: Callable[[torch.fx.Graph], None]):
self.custom_passes = args
from torch._inductor import config
self.current_config = config.shallow_copy_dict()
self.current_config['post_grad_custom_post_pass'] = self.post_pass
def __call__(self, graph: torch.fx.GraphModule, example_inputs):
from torch._inductor.compile_fx import compile_fx
return compile_fx(graph,
example_inputs,
config_patches=self.current_config)
def post_pass(self, graph: torch.fx.Graph):
self.graph_pre_pass = deepcopy(graph)
for pass_ in self.custom_passes:
pass_(graph)
self.graph_post_pass = deepcopy(graph)
# assign by reference, will reflect the final state of the graph
self.final_graph = graph
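
A minimal sketch of how this backend might be used; the pass and function here are illustrative assumptions, not part of the test suite:

import torch

def noop_pass(graph: torch.fx.Graph) -> None:
    # A real custom pass would rewrite the graph in place here.
    pass

backend = TestBackend(noop_pass)
compiled = torch.compile(lambda x: x + 1, backend=backend)
compiled(torch.randn(4))
# backend.graph_pre_pass and backend.graph_post_pass are now available
# for inspection, and backend.final_graph tracks the final graph state.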

View File

@@ -0,0 +1,5 @@
{
"use_cudagraph": true,
"non_cudagraph_ops": ["silly.attention"],
"cudagraph_copy_inputs": true
}
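
This JSON config mirrors the programmatic setup used in the toy-llama test later in this commit; a sketch of the in-code equivalent (cudagraph_copy_inputs has no counterpart in that test and is assumed to map to the config field of the same name):

from vllm.compilation.config import CompilationConfig
from vllm.plugins import set_compilation_config

set_compilation_config(
    CompilationConfig(
        use_cudagraph=True,
        non_cudagraph_ops=["silly.attention"],
    ))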

View File

@@ -0,0 +1,112 @@
"""
Test the piecewise compilation with a simple model so that we
can exactly calculate the expected output and side effects.
"""
import os
import torch
from torch import nn
from torch.library import Library
from vllm.compilation.compile_context import set_compile_context
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.compilation.levels import CompilationLevel
from vllm.config import VllmConfig
from vllm.utils import direct_register_custom_op
global_counter = 0
# create a library to hold the custom op
silly_lib = Library("silly", "FRAGMENT") # noqa
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out: torch.Tensor) -> None:
global global_counter
global_counter += 1
print(f"{global_counter=}")
out.copy_(q)
out[0] += 1
def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out: torch.Tensor) -> None:
return
direct_register_custom_op(
op_name="attention",
op_func=silly_attention,
mutates_args=["out"],
fake_impl=silly_attention_fake,
target_lib=silly_lib,
)
@support_torch_compile
class SillyModel(nn.Module):
def __init__(self,
*,
vllm_config: VllmConfig,
prefix: str = '',
**kwargs) -> None:
super().__init__()
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Overall effect:
x += 1
x[0] += 2
global_counter += 2
"""
x = x + 1
x = x + 2
out = torch.empty_like(x)
torch.ops.silly.attention(x, x, x, out)
x = out
x = x - 2
x = x - 1
out = torch.empty_like(x)
torch.ops.silly.attention(x, x, x, out)
x = out
x = x + 1
return x
def test_simple_piecewise_compile():
directory = os.path.dirname(__file__)
config = os.path.join(directory, "piecewise_compilation_config.json")
os.environ["VLLM_TORCH_COMPILE_CONFIG"] = config
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE)
model = SillyModel(vllm_config=VllmConfig(), prefix='')
inputs = torch.randn(100).cuda()
with compilation_counter.expect(
num_graphs_seen=1, # one graph for the model
num_piecewise_graphs_seen=5, # 2 * num_layers + 1
num_piecewise_capturable_graphs_seen=3, # 1 + num_layers
num_inductor_compilations=3, # num_piecewise_capturable_graphs_seen
num_cudagraph_caputured=
6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
):
with set_compile_context([1, 2]):
model(inputs)
model(torch.randn(2).cuda())
model(torch.randn(1).cuda())
input = torch.zeros(2).cuda()
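# Worked check of the docstring above for the zero input:
# x = 0 + 1 + 2 = 3; attention copies x and bumps element 0 -> [4, 3];
# x - 2 - 1 -> [1, 0]; second attention -> [2, 0]; x + 1 -> [3, 1].
# Each attention call also increments global_counter, hence 2 below.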
global global_counter
global_counter = 0
output = model(input)
assert global_counter == 2
assert torch.allclose(output.cpu(), torch.tensor([3., 1.]))
# clean up to avoid side effects for other tests
del os.environ["VLLM_TORCH_COMPILE_CONFIG"]

View File

@@ -0,0 +1,444 @@
"""
Test the piecewise compilation with a simple model, comparing the output
with and without the piecewise compilation.
The model is tractable: if the config `tractable_init` is set to True, the
weights and computation are specially designed so that the output can be
computed in closed form. Otherwise, the weights are initialized randomly with
a fixed seed.
"""
import os
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from torch import nn
from torch.library import Library
from vllm.compilation.compile_context import set_compile_context
from vllm.compilation.config import CompilationConfig
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.compilation.levels import CompilationLevel
from vllm.config import VllmConfig
from vllm.plugins import set_compilation_config
from vllm.utils import direct_register_custom_op
# create a library to hold the custom op
silly_lib = Library("silly", "FRAGMENT") # noqa
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out: torch.Tensor) -> None:
out.copy_(q)
out += k
out += v
def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out: torch.Tensor) -> None:
return
direct_register_custom_op(
op_name="attention",
op_func=silly_attention,
mutates_args=["out"],
fake_impl=silly_attention_fake,
target_lib=silly_lib,
)
@dataclass
class LlamaConfig:
hidden_size: int = 128
mlp_size: int = 256
vocab_size: int = 128
num_layers: int = 2
init_value: float = 1.0
tractable_init: bool = False
random_seed: int = 0
def __post_init__(self):
assert self.mlp_size >= self.hidden_size
class LlamaMLP(nn.Module):
def __init__(self, config: LlamaConfig) -> None:
super().__init__()
self.gate_up_projection = nn.Linear(
in_features=config.hidden_size,
out_features=config.mlp_size * 2,
bias=False,
)
self.down_projection = nn.Linear(
in_features=config.mlp_size,
out_features=config.hidden_size,
bias=False,
)
if config.tractable_init:
nn.init.eye_(self.gate_up_projection.weight.data[:config.mlp_size])
nn.init.eye_(self.gate_up_projection.weight.data[config.mlp_size:])
nn.init.eye_(self.down_projection.weight.data)
else:
nn.init.xavier_normal_(self.gate_up_projection.weight.data,
generator=torch.Generator().manual_seed(
config.random_seed),
gain=0.001)
nn.init.xavier_normal_(self.down_projection.weight.data,
generator=torch.Generator().manual_seed(
config.random_seed),
gain=0.001)
def forward(self, x):
# for tractable_init and positive input, this is
# essentially an elementwise-square
x = self.gate_up_projection(x)
x = x[:, :x.size(1) // 2] * torch.nn.functional.relu(
x[:, x.size(1) // 2:])
x = self.down_projection(x)
return x
class LlamaAttention(nn.Module):
def __init__(self, config: LlamaConfig) -> None:
super().__init__()
self.qkv_projection = nn.Linear(
in_features=config.hidden_size,
out_features=config.hidden_size * 3,
bias=False,
)
self.output_projection = nn.Linear(
in_features=config.hidden_size,
out_features=config.hidden_size,
bias=False,
)
if config.tractable_init:
nn.init.eye_(self.qkv_projection.weight.data[:config.hidden_size])
nn.init.eye_(self.qkv_projection.weight.data[config.hidden_size:2 *
config.hidden_size])
nn.init.eye_(self.qkv_projection.weight.data[2 *
config.hidden_size:])
nn.init.eye_(self.output_projection.weight.data)
else:
nn.init.xavier_normal_(self.qkv_projection.weight.data,
generator=torch.Generator().manual_seed(
config.random_seed),
gain=0.001)
nn.init.xavier_normal_(self.output_projection.weight.data,
generator=torch.Generator().manual_seed(
config.random_seed),
gain=0.001)
def forward(
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
) -> torch.Tensor:
# for tractable_init, this is:
# output = (hidden_states * 3 + positions * 2)
qkv = self.qkv_projection(hidden_states)
hidden_size = qkv.size(-1) // 3
q, k, v = qkv.split([hidden_size, hidden_size, hidden_size], dim=-1)
q = q + positions.unsqueeze(1)
k = k + positions.unsqueeze(1)
attn_output = torch.empty_like(q)
torch.ops.silly.attention(q, k, v, attn_output)
output = self.output_projection(attn_output)
return output
class LlamaDecoderLayer(nn.Module):
def __init__(self, config: LlamaConfig) -> None:
super().__init__()
self.self_attention = LlamaAttention(config)
self.mlp = LlamaMLP(config)
def forward(
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
residual: Optional[torch.Tensor],
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
For tractable computation:
- if residual is None, the outputs are:
- residual = (hidden_states + 1) * 3 + positions * 2 + hidden_states = hidden_states * 4 + positions * 2 + 3
- hidden_states = (residual + 1) ** 2
- if residual is not None, the outputs are:
- residual = (hidden_states + residual + 1) * 3 + positions * 2 + hidden_states + residual = (hidden_states + residual) * 4 + positions * 2 + 3
- hidden_states = (residual + 1) ** 2
""" # noqa
if residual is None:
residual = hidden_states
hidden_states = hidden_states + 1
else:
hidden_states = hidden_states + residual
residual = hidden_states
hidden_states = hidden_states + 1
hidden_states = self.self_attention(positions=positions,
hidden_states=hidden_states)
hidden_states = hidden_states + residual
residual = hidden_states
hidden_states = hidden_states + 1
hidden_states = self.mlp(hidden_states)
return hidden_states, residual
@support_torch_compile
class LlamaModel(nn.Module):
def __init__(self,
*,
vllm_config: VllmConfig,
config: LlamaConfig,
prefix: str = '',
**kwargs) -> None:
super().__init__()
self.embedding_tokens = nn.Embedding(
num_embeddings=config.vocab_size,
embedding_dim=config.hidden_size,
)
self.layers = nn.ModuleList(
[LlamaDecoderLayer(config) for _ in range(config.num_layers)])
# this is the initial value of the hidden states
self.embedding_tokens.weight.data.fill_(config.init_value)
def forward(
self,
input_ids: Optional[torch.Tensor],
positions: torch.Tensor,
) -> torch.Tensor:
hidden_states = self.embedding_tokens(input_ids)
residual = None
for layer in self.layers:
hidden_states, residual = layer(positions, hidden_states, residual)
return hidden_states
def tractable_computation(input_ids: torch.Tensor,
positions: torch.Tensor,
config: LlamaConfig,
init_value: float = 1.0) -> torch.Tensor:
hidden_states = torch.ones(input_ids.size(0),
config.hidden_size,
device=input_ids.device,
dtype=input_ids.dtype) * init_value
# first layer
residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3
hidden_states = (residual + 1)**2
# following layers
for _ in range(config.num_layers - 1):
hidden_states = hidden_states + residual
residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3
hidden_states = (residual + 1)**2
return hidden_states
@torch.inference_mode
def run_model(llama_config,
use_compile: bool,
split_attn: bool = False) -> torch.Tensor:
if use_compile:
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(
CompilationLevel.PIECEWISE)
if split_attn:
set_compilation_config(
CompilationConfig(
use_cudagraph=True,
non_cudagraph_ops=["silly.attention"],
))
else:
set_compilation_config(CompilationConfig(use_cudagraph=True, ))
else:
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(
CompilationLevel.NO_COMPILATION)
set_compilation_config(None)
model = LlamaModel(config=llama_config,
vllm_config=VllmConfig(),
prefix="").eval().cuda()
B = 16 # max batch size
input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
positions = torch.arange(B).cuda()
with set_compile_context([1, 2]):
model(input_ids, positions)
model(input_ids[:2], positions[:2])
model(input_ids[:1], positions[:1])
input_ids[:2].zero_()
output = model(input_ids[:2], positions[:2])
# manual cleanup
del os.environ["VLLM_TORCH_COMPILE_LEVEL"]
set_compilation_config(None)
output = output.cpu()
if llama_config.tractable_init:
expected_output = tractable_computation(input_ids[:2], positions[:2],
llama_config).cpu()
assert torch.allclose(output, expected_output)
else:
return output.cpu()
def test_toy_llama():
# compare output with and without piecewise compilation
llama_config = LlamaConfig(hidden_size=128,
mlp_size=256,
vocab_size=128,
num_layers=12)
tractable_config = LlamaConfig(hidden_size=128,
mlp_size=256,
vocab_size=128,
num_layers=2,
tractable_init=True)
outputs = []
with compilation_counter.expect(
num_graphs_seen=0,
num_piecewise_graphs_seen=0,
num_piecewise_capturable_graphs_seen=0,
num_inductor_compilations=0,
num_cudagraph_caputured=0,
):
outputs.append(run_model(llama_config, use_compile=False))
run_model(tractable_config, use_compile=False)
with compilation_counter.expect(
num_graphs_seen=1, # one graph for the model
num_piecewise_graphs_seen=1,
num_piecewise_capturable_graphs_seen=1,
num_inductor_compilations=1, # num_piecewise_capturable_graphs_seen
num_cudagraph_caputured=
2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
):
outputs.append(run_model(llama_config, use_compile=True))
run_model(tractable_config, use_compile=True)
with compilation_counter.expect(
num_graphs_seen=1, # one graph for the model
num_piecewise_graphs_seen=2 * llama_config.num_layers +
1, # 2 * num_layers + 1
num_piecewise_capturable_graphs_seen=1 +
llama_config.num_layers, # 1 + num_layers
num_inductor_compilations=1 +
llama_config.num_layers, # num_piecewise_capturable_graphs_seen
num_cudagraph_caputured=2 *
(1 + llama_config.num_layers
), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
):
outputs.append(
run_model(llama_config, use_compile=True, split_attn=True))
run_model(tractable_config, use_compile=True, split_attn=True)
for i in range(1, len(outputs)):
assert torch.allclose(outputs[0], outputs[i])
@torch.inference_mode
def benchmark():
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE)
from triton.testing import do_bench
# similar to llama 3.1-8B
llama_config = LlamaConfig(hidden_size=4096,
mlp_size=14336,
vocab_size=128 * 1024,
num_layers=32)
# a tiny model to measure the overhead
# of piecewise cudagraph
llama_config = LlamaConfig(hidden_size=40,
mlp_size=80,
vocab_size=128,
num_layers=2)
cudagraph_sizes = [1, 2, 4] + [i * 8 for i in range(1, 33)]
eager_time = {}
full_cudagraph_time = {}
piecewise_cudagraph_time = {}
pool = torch.cuda.graph_pool_handle()
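# Sharing one pool lets all manually captured CUDA graphs below reuse
# the same memory.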
for piecewise in [False, True]:
if piecewise:
set_compilation_config(
CompilationConfig(
use_cudagraph=True,
non_cudagraph_ops=["silly.attention"],
))
else:
set_compilation_config(None)
model = LlamaModel(config=llama_config,
vllm_config=VllmConfig(),
prefix="").eval().cuda().to(torch.bfloat16)
B = 256 # max batch size
input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
positions = torch.arange(B).cuda().to(torch.bfloat16)
graphs = {}
with set_compile_context(cudagraph_sizes):
model(input_ids, positions)
for b in cudagraph_sizes[::-1]:
if not piecewise:
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph, pool=pool):
output = model(input_ids[:b], positions[:b])
graphs[b] = (graph, output)
else:
output = model(input_ids[:b], positions[:b])
graphs[b] = (model, output)
for b in cudagraph_sizes:
if piecewise:
# noqa is for `Function definition does not bind loop variable`
# it will be problematic if we save the created lambda function
# and use it later, because it will look up the name `b` in the
# enclosing scope, and the value of `b` will always be 256.
# it is fine here, because we only use the lambda function once.
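# (If the lambda did have to be saved and reused, binding the loop
# variable with a default argument, e.g. `lambda b=b: ...`, would avoid
# this late-binding pitfall.)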
runtime = do_bench(lambda: graphs[b][0] # noqa
(input_ids[:b], positions[:b])) # noqa
piecewise_cudagraph_time[b] = runtime
else:
runtime = do_bench(lambda: graphs[b][0].replay()) # noqa
eager_runtime = do_bench(
lambda: model(input_ids[:b], positions[:b])) # noqa
full_cudagraph_time[b] = runtime
eager_time[b] = eager_runtime
# print in tabular format
print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph")
for b in cudagraph_sizes:
print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
f"\t{piecewise_cudagraph_time[b]:.3f}")
if __name__ == "__main__":
benchmark()

View File

@@ -0,0 +1,126 @@
import dataclasses
from typing import Dict, List, Optional
import pytest
from vllm.compilation.levels import CompilationLevel
from vllm.utils import cuda_device_count_stateless
from ..utils import compare_all_settings
@dataclasses.dataclass
class TestSetting:
model: str
model_args: List[str]
pp_size: int
tp_size: int
attn_backend: str
method: str
fullgraph: bool
# representative settings for testing
test_settings = [
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B",
model_args=[],
pp_size=2,
tp_size=2,
attn_backend="FLASHINFER",
method="generate",
fullgraph=True,
),
# llama model with quantization
TestSetting(
model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
model_args=["--quantization", "gptq"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# MoE model
TestSetting(
model="ibm/PowerMoE-3b",
model_args=[],
pp_size=1,
tp_size=2,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# embedding model
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model_args=["--task", "embedding"],
pp_size=1,
tp_size=1,
attn_backend="FLASHINFER",
method="encode",
fullgraph=True,
),
# vision language model
TestSetting(
model="microsoft/Phi-3.5-vision-instruct",
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate_with_image",
fullgraph=False,
),
]
# we cannot afford testing the full Cartesian product
# of all models and all levels
@pytest.mark.parametrize("test_setting", test_settings)
def test_compile_correctness(test_setting: TestSetting):
    # this test is run under multiple test suites, with different GPUs.
# make sure we only run the test with correct CUDA devices.
# don't use "<", as it will duplicate the tests.
model = test_setting.model
model_args = test_setting.model_args
pp_size = test_setting.pp_size
tp_size = test_setting.tp_size
attn_backend = test_setting.attn_backend
method = test_setting.method
fullgraph = test_setting.fullgraph
if cuda_device_count_stateless() != pp_size * tp_size:
pytest.skip("Not correct CUDA devices for the test.")
import os
os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
["-tp", str(tp_size)]
all_envs: List[Optional[Dict[str, str]]] = []
for level in [
CompilationLevel.NO_COMPILATION,
CompilationLevel.PIECEWISE,
]:
all_envs.append({"VLLM_TORCH_COMPILE_LEVEL": str(level)})
    # inductor will change the output, so we only check that the outputs
    # are close, not exactly the same.
compare_all_settings(
model, [final_args] * 2,
all_envs,
method=method if method != "generate" else "generate_close")
all_envs.clear()
for level in [
CompilationLevel.NO_COMPILATION,
CompilationLevel.DYNAMO_AS_IS,
CompilationLevel.DYNAMO_ONCE,
]:
all_envs.append({"VLLM_TORCH_COMPILE_LEVEL": str(level)})
if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
# "DYNAMO_ONCE" will always use fullgraph
all_envs[-1][
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
compare_all_settings(model, [final_args] * 3, all_envs, method=method)

View File

@@ -0,0 +1,20 @@
import pytest
from vllm.compilation.levels import CompilationLevel
from ..utils import fork_new_process_for_each_test
from .utils import TEST_MODELS, check_full_graph_support
@pytest.mark.parametrize("model_info", TEST_MODELS)
@pytest.mark.parametrize(
"optimization_level",
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE])
@fork_new_process_for_each_test
def test_full_graph(model_info, optimization_level):
model = model_info[0]
model_kwargs = model_info[1]
check_full_graph_support(model,
model_kwargs,
optimization_level,
tp_size=1)

View File

@@ -0,0 +1,92 @@
import pytest
import torch
from compressed_tensors.quantization import FP8_DTYPE
import vllm.envs as envs
from vllm.compilation.config import CompilationConfig
from vllm.compilation.fusion import (FusionPass, find_auto_fn,
find_auto_fn_maybe)
from vllm.compilation.reshapes import RedundantReshapesPass
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
apply_fp8_linear)
from .backend import TestBackend
class TestModel(torch.nn.Module):
def __init__(self, hidden_size: int, eps: float, *args, **kwargs):
super().__init__(*args, **kwargs)
self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(4)]
self.w = [
torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
for _ in range(2)
]
def forward(self, x):
resid = torch.relu(x)
y = self.norm[0](x)
x2 = apply_fp8_linear(y, self.w[0], self.scale[0], self.scale[1])
        # make sure resid is used downstream, so the fusion replacement can apply
y2, resid = self.norm[1](x2, resid)
x3 = apply_fp8_linear(y2, self.w[1], self.scale[2], self.scale[3])
y3, resid = self.norm[2](x3, resid) # use resid here
return y3
# Init does pattern registration, which can only happen once
config = CompilationConfig(enable_fusion=True)
reshape_pass = RedundantReshapesPass(config)
fusion_pass = FusionPass.instance(config)
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("hidden_size", [64, 3392, 4096])
@pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049])
@pytest.mark.parametrize("eps", [1e-5, 1e-6])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda",
reason="Only test on CUDA")
def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps):
torch.set_default_device("cuda")
torch.set_default_dtype(torch.float16)
if eps != 1e-5:
pytest.skip("Only test eps=1e-5 for now")
# Reshape pass is needed for the fusion pass to work
backend = TestBackend(reshape_pass, fusion_pass)
model = TestModel(hidden_size, eps)
# First dimension dynamic
x = torch.rand(num_tokens, hidden_size)
torch._dynamo.mark_dynamic(x, 0)
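    # mark_dynamic marks dim 0 (num_tokens) as dynamic, so Dynamo compiles a
    # single graph that is reused across different numbers of tokens.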
result = model(x)
model2 = torch.compile(model, backend=backend)
result2 = model2(x)
# Check that it gives the same answer
torch.testing.assert_close(result, result2, atol=1e-3, rtol=1e-3)
# Check substitution worked
pre_nodes = backend.graph_pre_pass.nodes
post_nodes = backend.graph_post_pass.nodes
rms_quant = torch.ops._C.rms_norm_static_fp8_quant.default
add_rms_quant = torch.ops._C.fused_add_rms_norm_static_fp8_quant.default
fp8_quant = torch.ops._C.static_scaled_fp8_quant.default
# In pre-nodes, fp8 quant should be present and fused kernels should not
assert find_auto_fn_maybe(pre_nodes, rms_quant) is None
assert find_auto_fn_maybe(pre_nodes, add_rms_quant) is None
find_auto_fn(pre_nodes, fp8_quant)
# In post-nodes, fused kernels should be present and fp8 quant should not
find_auto_fn(post_nodes, rms_quant)
find_auto_fn(post_nodes, add_rms_quant)
assert find_auto_fn_maybe(post_nodes, fp8_quant) is None

View File

@@ -0,0 +1,59 @@
from typing import Optional
import torch
from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
class MyMod(torch.nn.Module):
def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
if cache is not None:
return x + cache
return x * 2
class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
def __init__(self, model):
self.model = model
compiled_callable = torch.compile(self.forward, backend="eager")
super().__init__(compiled_callable)
def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
# this is the function to be compiled
return self.model(x, cache)
def __call__(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
# let torch.compile compile twice
if len(self.compiled_codes) == 2:
dispatch_id = 0 if cache is None else 1
with self.dispatch_to_code(dispatch_id):
return self.forward(x, cache)
else:
return self.compiled_callable(x, cache)
def test_torch_compile_wrapper():
mod = MyMod()
wrappers = []
for i in range(3):
torch._dynamo.reset()
wrapper = MyWrapper(mod)
wrappers.append(wrapper)
x = torch.tensor([1])
wrapper(x, None) # profile run, compile
# create a cache tensor
cache = torch.tensor([2])
wrapper(x, cache) # warm up with cache, recompile
# for new input, dispatch to the compiled code directly
new_x = torch.tensor([3])
assert wrapper(new_x,
None).item() == 6 # dispatch to the first compiled code
assert wrapper(
new_x, cache).item() == 5 # dispatch to the second compiled code
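    # Sanity check of the expected values, from MyMod.forward: with cache=None
    # the module computes x * 2 (3 * 2 == 6); with cache=tensor([2]) it
    # computes x + cache (3 + 2 == 5).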
for wrapper in wrappers:
# make sure they have independent compiled codes
assert len(wrapper.compiled_codes) == 2

View File

@@ -0,0 +1,97 @@
import os
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.compilation.levels import CompilationLevel
from vllm.platforms import current_platform
TEST_MODELS = [
("facebook/opt-125m", {}),
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", {
"dtype": torch.float16,
"quantization": "fp8"
}),
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
"quantization": "compressed-tensors"
}),
("meta-llama/Meta-Llama-3-8B", {}),
]
if is_quant_method_supported("aqlm"):
TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
"quantization": "aqlm"
}))
# TODO: figure out why this fails.
if False and is_quant_method_supported("gguf"): # noqa: SIM223
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
"quantization": "gguf"
}))
if is_quant_method_supported("gptq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
"quantization": "gptq"
}))
if is_quant_method_supported("gptq_marlin"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
"quantization": "gptq_marlin"
}))
if is_quant_method_supported("gptq_marlin_24"):
TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
"quantization": "gptq_marlin_24"
}))
if is_quant_method_supported("marlin"):
TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
"quantization": "marlin"
}))
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
"quantization": "AWQ"
}))
def check_full_graph_support(model,
model_kwargs,
optimization_level,
tp_size=1):
# make sure these models can be captured in full graph mode
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level)
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
# The base meta llama uses too much memory.
if (model == "meta-llama/Meta-Llama-3-8B"
and optimization_level >= CompilationLevel.PIECEWISE):
return
print(f"MODEL={model}")
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(model=model,
enforce_eager=True,
tensor_parallel_size=tp_size,
disable_custom_all_reduce=True,
**model_kwargs)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

File diff suppressed because it is too large

View File

View File

View File

@@ -0,0 +1,12 @@
import pytest
@pytest.fixture()
def should_do_global_cleanup_after_test() -> bool:
"""Disable the global cleanup fixture for tests in this directory. This
provides a ~10x speedup for unit tests that don't load a model to GPU.
This requires that tests in this directory clean up after themselves if they
use the GPU.
"""
return False

View File

@@ -0,0 +1,67 @@
from typing import Callable, Iterable, Optional
import pytest
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.utils import set_random_seed
@pytest.fixture
def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, seed):
return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, seed)
@pytest.fixture
def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
test_llm_kwargs, seed):
return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
test_llm_kwargs, seed)
def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
distinct_llm_kwargs, seed):
kwargs = {
**common_llm_kwargs,
**per_test_common_llm_kwargs,
**distinct_llm_kwargs,
}
def generator_inner():
llm = LLM(**kwargs)
set_random_seed(seed)
yield llm
del llm
cleanup_dist_env_and_memory()
for llm in generator_inner():
yield llm
del llm
def get_text_from_llm_generator(llm_generator: Iterable[LLM],
prompts,
sampling_params,
llm_cb: Optional[Callable[[LLM],
None]] = None):
for llm in llm_generator:
if llm_cb:
llm_cb(llm)
outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
text = [output.outputs[0].text for output in outputs]
del llm
return text
def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
for llm in llm_generator:
outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
token_ids = [output.outputs[0].token_ids for output in outputs]
del llm
return token_ids

View File

@@ -0,0 +1,489 @@
from itertools import cycle
import pytest
from vllm import SamplingParams
from .conftest import get_token_ids_from_llm_generator
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
# skip cuda graph creation for fast test.
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16,
"num_gpu_blocks_override": 5 * (64 + 1),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"preemption_mode": "swap"
}, {
"preemption_mode": "recompute"
}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_block_manager_with_preemption(baseline_llm_generator,
test_llm_generator, batch_size):
"""Verify block manager produces same outputs even when there is preemption.
This constructs two LLM, each with limited number of GPU blocks. The limit
is decided such that as the sequences in the batch grow, sequences must be
preempted and removed from cache.
If the output token ids are equivalent, then we have confidence that the KV
cache is not corrupted.
NOTE: We want a significant number of generated tokens so that any incorrect
KV mapping has time to build up error.
    NOTE(Kuntai): Though we have removed block manager v1, this test is still
    useful as it asserts that the behavior of block manager v2 (now called
    SelfAttnBlockSpaceManager) is unchanged under swapping / preemption, so we
    keep this test.
"""
output_len = 1024
temperature = 0.0
# We want to ensure equality even with preemption.
# We force the total block size to be 1 + cdiv(output_len, block_size)
# so that only one sequence can fit at a time (once the sequences grow).
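    # Concretely (illustrative arithmetic, not part of the original test):
    # with block_size=16 and output_len=1024, a fully grown sequence needs
    # cdiv(1024, 16) + 1 = 65 blocks, so num_gpu_blocks_override = 5 * 65
    # admits at most 5 such sequences.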
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
sampling_params = SamplingParams(
max_tokens=output_len,
ignore_eos=True,
temperature=temperature,
)
baseline_token_ids = get_token_ids_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
prompts, sampling_params)
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
test_token_ids):
assert expected_token_ids == actual_token_ids
assert baseline_token_ids == test_token_ids
'''
=============================
Modify by vllm_mlu
=============================
@brief(block_size): MLU paged attention only supports block_size=16
'''
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
# Our prompts will generate 128 tokens; since the prompts themselves are
# small, we don't need much KV space beyond 128.
"max_model_len": 160,
# skip cuda graph creation for fast test.
"enforce_eager": True,
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
[
{
"block_size": 16,
# Allow only 2 sequences of ~128 tokens in worst case.
# Note 8 = 128/block_size
"num_gpu_blocks_override": 2 * (8 + 1),
},
{
"block_size": 16,
# Allow only 2 sequences of ~128 tokens in worst case.
            # Note: 16 = 128/8; this override was sized for the original
            # block_size of 8 and is kept unchanged with block_size=16.
"num_gpu_blocks_override": 2 * (16 + 2),
}
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
"num_lookahead_slots": 0,
}])
@pytest.mark.parametrize(
"test_llm_kwargs",
[
{
            # Originally one test ran with block_size < lookahead_slots and
            # one with block_size > lookahead_slots; after the vllm_mlu
            # block_size=16 change, both configs exceed the lookahead size.
"num_lookahead_slots": 10,
"preemption_mode": "swap",
},
{
"num_lookahead_slots": 10,
"preemption_mode": "recompute",
}
])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
test_llm_generator,
batch_size):
"""Verify vLLM produces the same output with greedy sampling, when lookahead
scheduling is used vs. not.
Lookahead scheduling is not expected to modify the output, as it simply
allocates empty slots ahead of the known token ids in a sliding fashion.
This test constrains the total number of blocks to force preemption. It also
varies the block size so that the lookahead size is less than and greater
than the block size.
"""
output_len = 128
temperature = 0.0
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
sampling_params = SamplingParams(
max_tokens=output_len,
ignore_eos=True,
temperature=temperature,
)
print('Getting token ids without lookahead scheduling')
baseline_token_ids = get_token_ids_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
print('Getting token ids with lookahead scheduling')
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
prompts, sampling_params)
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
test_token_ids):
assert expected_token_ids == actual_token_ids
assert baseline_token_ids == test_token_ids
'''
=============================
Modify by vllm_mlu
=============================
@brief(block_size): MLU paged attention only supports block_size=16; changed block_size from 8 to 16
'''
@pytest.mark.parametrize(
"common_llm_kwargs",
[
{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
# skip cuda graph creation for fast test.
"enforce_eager": True,
"enable_chunked_prefill": True,
"gpu_memory_utilization": 0.6,
},
])
@pytest.mark.parametrize("per_test_common_llm_kwargs",
[{
"block_size": 16,
"max_num_batched_tokens": 2,
"max_num_seqs": 2,
}, {
"block_size": 16,
"max_num_batched_tokens": 3,
"max_num_seqs": 2,
}, {
"block_size": 16,
"max_num_batched_tokens": 256,
"max_num_seqs": 10,
}])
@pytest.mark.parametrize("baseline_llm_kwargs", [
{},
])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"num_lookahead_slots": 0,
},
{
"num_lookahead_slots": 5,
},
])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_chunked_prefill_block_manager(baseline_llm_generator,
test_llm_generator, batch_size):
"""Verify that chunked prefill works with SelfAttnBlockSpaceManager,
with and without lookahead scheduling.
"""
output_len = 32
temperature = 0.0
prompts = [
"Hello, my name is",
"The president of the United States is",
("1 + " * 50) + " 1 = ", # Longer prompt.
"The capital of France is",
"The future of AI is",
]
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
sampling_params = SamplingParams(
max_tokens=output_len,
ignore_eos=True,
temperature=temperature,
)
print('Getting token ids with BlockManager')
baseline_token_ids = get_token_ids_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
print('Getting token ids with BlockManager, with lookahead slots.')
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
prompts, sampling_params)
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
test_token_ids):
assert expected_token_ids == actual_token_ids
assert baseline_token_ids == test_token_ids
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
# skip cuda graph creation for fast test.
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16,
"num_gpu_blocks_override": 5 * (64 + 1),
# Enable prefill cache
"enable_prefix_caching": True,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"preemption_mode": "swap"
}, {
"preemption_mode": "recompute"
}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_block_manager_prefix_caching_enabled_with_preemption(
baseline_llm_generator, test_llm_generator, batch_size):
"""Verify block manager produces same outputs even when there is preemption.
This constructs two LLM, each with limited number of GPU blocks. The limit
is decided such that as the sequences in the batch grow, sequences must be
preempted and removed from cache.
If the output token ids are equivalent, then we have confidence that the KV
cache is not corrupted.
NOTE: We want a significant number of generated tokens so that any incorrect
KV mapping has time to build up error.
    NOTE(Kuntai): Though we have removed block manager v1, this test is still
    useful as it asserts that the behavior of block manager v2 (now called
    SelfAttnBlockSpaceManager) is unchanged under swapping / preemption, so we
    keep this test.
"""
output_len = 1024
temperature = 0.0
# We want to ensure equality even with preemption.
# We force the total block size to be 1 + cdiv(output_len, block_size)
# so that only one sequence can fit at a time (once the sequences grow).
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
sampling_params = SamplingParams(
max_tokens=output_len,
ignore_eos=True,
temperature=temperature,
)
print('Getting token ids from block manager')
baseline_token_ids = get_token_ids_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
print('Getting token ids from block manager, with preemption')
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
prompts, sampling_params)
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
test_token_ids):
assert expected_token_ids == actual_token_ids
assert baseline_token_ids == test_token_ids
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
# skip cuda graph creation for fast test.
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16,
"num_gpu_blocks_override": 5 * (64 + 1),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
"enable_prefix_caching": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"enable_prefix_caching": True,
"preemption_mode": "swap"
}, {
"enable_prefix_caching": True,
"preemption_mode": "recompute"
}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
test_llm_generator, batch_size):
"""Verify block manager v2 with auto prefix caching enabled produces same
outputs as auto prefix caching disabled, even when there is preemption.
This constructs two LLM, each with limited number of GPU blocks. The limit
is decided such that as the sequences in the batch grow, sequences must be
preempted and removed from cache.
    If the output token ids are equivalent, then we have confidence that auto
    prefix caching itself at least doesn't corrupt the results.
"""
output_len = 1024
temperature = 0.0
# We want to ensure equality even with preemption.
# We force the total block size to be 1 + cdiv(output_len, block_size)
# so that only one sequence can fit at a time (once the sequences grow).
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
sampling_params = SamplingParams(
max_tokens=output_len,
ignore_eos=True,
temperature=temperature,
)
print('Getting token ids with APC disabled')
baseline_token_ids = get_token_ids_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
print('Getting token ids with APC enabled')
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
prompts, sampling_params)
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
test_token_ids):
assert expected_token_ids == actual_token_ids
assert baseline_token_ids == test_token_ids
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
# skip cuda graph creation for fast test.
"enforce_eager": True,
        # we keep the number of blocks small so that we hit eviction quickly
"max_model_len": 48,
"block_size": 16,
"num_gpu_blocks_override": 3,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
"enable_prefix_caching": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"enable_prefix_caching": True,
}])
@pytest.mark.parametrize("seed", [1])
def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
                                                  test_llm_generator):
"""Verify block manager v2 with auto prefix caching could works normal
even when eviction started.
With APC enabled, all blocks are held by native block at the beginning.
Then blocks are managed by evictor instead. If cache hit at the evitor's
block, then it could be reused, or we need to recompute its kv cache.
"""
output_len = 10
temperature = 0.0
prompts = [
"You are a helpful assistant. Please answer truthfully and write "
"out your thinking step by step to be sure you get the right answer. "
"If you make a mistake, attempt to correct it. who are you?",
"You are a helpful assistant. Please answer truthfully and write out "
"your thinking step by step to be sure you get the right answer. You "
"are helpful and harmless and you follow ethical guidelines. "
"who are you?"
]
sampling_params = SamplingParams(
max_tokens=output_len,
ignore_eos=True,
temperature=temperature,
)
print('Getting token ids with APC disabled')
baseline_token_ids = get_token_ids_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
print('Getting token ids with APC enabled')
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
prompts, sampling_params)
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
test_token_ids):
assert expected_token_ids == actual_token_ids
assert baseline_token_ids == test_token_ids

View File

@@ -0,0 +1,180 @@
import random
from typing import List
import pytest
from vllm import LLM, SamplingParams
from .conftest import get_text_from_llm_generator
# relatively small model with 4k sliding window.
'''
=============================
Modify by vllm_mlu
=============================
Currently tmo.apply_rotary does not support offsets, so bigcode/starcoder2-3b
cannot run. Use mistralai/Mistral-7B-v0.1 instead, which also has a 4k
sliding window.
'''
# The original model is: MODEL = "bigcode/starcoder2-3b"
MODEL = "mistralai/Mistral-7B-v0.1"
BLOCK_SIZE = 16
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model": MODEL,
# skip cuda graph creation for fast test.
"enforce_eager": True,
"block_size": BLOCK_SIZE,
# needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008
"num_gpu_blocks_override": 100000 // BLOCK_SIZE,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
                                  batch_size, seed):
"""
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
    asks for the value of one of them (which is outside the sliding window).
    If we tell it upfront which one we are going to be looking for, then
it answers correctly (mostly).
Additionally, we compare the results of the v1 and v2 managers.
"""
sampling_params = SamplingParams(
max_tokens=1024,
ignore_eos=True,
temperature=0.0,
)
prompts, answer, indices = prep_prompts(batch_size)
baseline_texts = get_text_from_llm_generator(baseline_llm_generator,
prompts,
sampling_params,
llm_cb=check_window(prompts))
check_answers(indices, answer, baseline_texts)
print('Getting token ids from block manager v2')
test_texts = get_text_from_llm_generator(test_llm_generator, prompts,
sampling_params)
check_answers(indices, answer, test_texts)
cmp = [
expected_text == actual_text
for expected_text, actual_text in zip(baseline_texts, test_texts)
]
print(cmp)
# make sure it's mostly OK; this is possibly because https://github.com/vllm-project/vllm/pull/4768
# however, https://github.com/vllm-project/vllm/issues/3385#issuecomment-1995924290
# states that xformers and flash_attn have different ideas about the window
# size anyways
assert sum(cmp) > 0.7 * len(cmp)
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model": MODEL,
# skip cuda graph creation for fast test.
"enforce_eager": True,
"block_size": BLOCK_SIZE,
"num_gpu_blocks_override": 100000 // BLOCK_SIZE,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}])
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed):
"""
    This is similar to test_sliding_window_retrieval; however, it doesn't
compare against the v1 block manager since v1 doesn't support
chunked prefill with sliding window.
The results with and without chunked prefill are not the same due to
numerical instabilities.
"""
sampling_params = SamplingParams(
max_tokens=10,
ignore_eos=True,
temperature=0.0,
)
prompts, answer, indices = prep_prompts(batch_size)
    # We don't compare with the baseline model here, since the results are
    # slightly different due to different tail handling in attention.
test_texts = get_text_from_llm_generator(test_llm_generator,
prompts,
sampling_params,
llm_cb=check_window(prompts))
check_answers(indices, answer, test_texts)
def prep_prompts(batch_size: int):
"""
    Generate prompts with a bunch of assignments,
    then ask for the value of one of them.
    In the original test the prompt is just under 10k tokens with a 4k sliding
    window, so the answer lies outside the window but should still be correct;
    the vllm_mlu modification below shortens the prompts for the Mistral model.
"""
prompts: List[str] = []
answer: List[int] = []
indices: List[int] = []
random.seed(1)
for _ in range(batch_size):
idx = random.randint(30, 90)
indices.append(idx)
prompt = "```python\n# We set a number of variables, " + \
f"x{idx} will be important later\n"
'''
=============================
Modify by vllm_mlu
=============================
Since we use a different model, the length of the
prompt needs to be reset to a proper value as well
'''
# The original value is 800~1100
ln = random.randint(400, 500)
for k in range(30, ln):
v = random.randint(10, 99)
if k == idx:
answer.append(v)
prompt += f"x{k} = {v}\n"
prompt += f"# Now, we check the value of x{idx}:\n"
prompt += f"assert x{idx} == "
prompts.append(prompt)
return prompts, answer, indices
def check_answers(indices: List[int], answer: List[int], outputs: List[str]):
answer2 = [int(text[0:2].strip()) for text in outputs]
print(list(zip(indices, zip(answer, answer2))))
numok = 0
for a1, a2 in zip(answer, answer2):
if a1 == a2:
numok += 1
frac_ok = numok / len(answer)
print(f"Num OK: {numok}/{len(answer)} {frac_ok}")
# The original value is 0.7
assert frac_ok >= 0.4
def check_window(prompts: List[str]):
def inner(llm: LLM):
sliding_window = llm.llm_engine.model_config.get_sliding_window()
assert sliding_window and sliding_window > 0
assert any(
len(llm.get_tokenizer().tokenize(prompt)) > sliding_window
for prompt in prompts)
return inner

View File

@@ -0,0 +1,491 @@
import pytest
from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
STR_NOT_IMPL_ENC_DEC_SWA)
from vllm.core.block_manager import SelfAttnBlockSpaceManager
from vllm.core.interfaces import AllocStatus
from vllm.sequence import Logprob, SequenceStatus
from vllm.utils import chunk_list
from ..utils import (create_dummy_prompt, create_seq_group,
create_seq_group_encoder_decoder)
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80])
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
num_gpu_blocks: int, watermark: float):
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
watermark=watermark,
)
num_watermark_blocks = int(watermark * num_gpu_blocks)
num_output_blocks_per_seq = 1
# NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
# the current implementation assumes all seqs are new prompts / don't have
# different output lens.
num_output_blocks = num_output_blocks_per_seq
for num_prompt_blocks in range(1, num_gpu_blocks - num_output_blocks):
seq_group = create_seq_group(
seq_prompt_len=block_size * num_prompt_blocks,
seq_output_lens=[
block_size * num_output_blocks_per_seq
for _ in range(num_seqs_per_group)
],
)
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
can_allocate_result = block_manager.can_allocate(seq_group)
num_required_blocks = num_prompt_blocks + num_output_blocks
if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
assert can_allocate_result == AllocStatus.NEVER
elif num_gpu_blocks >= num_required_blocks:
assert can_allocate_result == AllocStatus.OK
else:
assert can_allocate_result == AllocStatus.LATER
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [16, 80, 160])
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_seq_group_encoder_decoder(block_size: int,
num_seqs_per_group: int,
num_gpu_blocks: int,
watermark: float):
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
watermark=watermark,
)
num_watermark_blocks = int(watermark * num_gpu_blocks)
num_output_blocks_per_seq = 1
# NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
# the current implementation assumes all seqs are new prompts / don't have
# different output lens.
num_output_blocks = num_output_blocks_per_seq
for bdx, num_prompt_blocks in enumerate(
range(1, num_gpu_blocks - num_output_blocks)):
num_cross_blocks_per_seq = num_prompt_blocks
seq_group = create_seq_group_encoder_decoder(
seq_prompt_len=block_size * num_prompt_blocks,
seq_output_lens=[
block_size * num_output_blocks_per_seq
for _ in range(num_seqs_per_group)
],
request_id=str(bdx))
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
can_allocate_result = block_manager.can_allocate(seq_group)
num_required_blocks = num_prompt_blocks + \
num_output_blocks + \
num_cross_blocks_per_seq
if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
assert can_allocate_result == AllocStatus.NEVER
elif num_gpu_blocks >= num_required_blocks:
assert can_allocate_result == AllocStatus.OK
else:
assert can_allocate_result == AllocStatus.LATER
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [16])
@pytest.mark.parametrize("num_seqs_per_group", [1])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
num_seqs_per_group: int,
num_gpu_blocks: int,
watermark: float):
'''
    SWA is short for Sliding Window Attention.
    At the time of writing, the block manager does not support SWA.
    However, even once SWA is implemented for the block manager,
    there will still most likely be a separate workstream required
    to enable SWA for encoder/decoder models.
    Therefore this test enforces that one of the following cases
    holds true:
1. Block manager does not support SWA at all (true at time of writing)
    2. Block manager fails with NotImplementedError when SWA is enabled
AND a SequenceGroup with an encoder sequence (i.e. in support of an
encoder/decoder model) is passed into can_allocate() as an argument
    The setup for this test is a stripped-down version of
    test_can_allocate_seq_group_encoder_decoder()
'''
with pytest.raises((NotImplementedError, AssertionError)) as exc_info:
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
watermark=watermark,
sliding_window=5 # SWA
)
num_output_blocks_per_seq = 1
num_prompt_blocks = 1
num_output_blocks = num_output_blocks_per_seq
seq_group = create_seq_group_encoder_decoder(
seq_prompt_len=block_size * num_prompt_blocks,
seq_output_lens=[
block_size * num_output_blocks_per_seq
for _ in range(num_seqs_per_group)
],
request_id="0")
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
block_manager.can_allocate(seq_group)
# Assert that either
# 1. Block manager constructor fails with assertion that sliding window
# is not yet supported (most likely near-term outcome at time of
# writing), or
# 2. can_allocate() fails with NotImplementedError due to combination of
# encoder/decoder and sliding window attention
if isinstance(exc_info.value, NotImplementedError):
assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA
elif isinstance(exc_info.value, AssertionError):
assert str(exc_info.value) == "Sliding window not yet supported"
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [16])
@pytest.mark.parametrize("num_seqs_per_group", [1])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_encoder_decoder_fails_with_prefix_cache(
block_size: int, num_seqs_per_group: int, num_gpu_blocks: int,
watermark: float):
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
watermark=watermark,
enable_caching=True # Prefix cache
)
num_output_blocks_per_seq = 1
num_prompt_blocks = 1
num_output_blocks = num_output_blocks_per_seq
seq_group = create_seq_group_encoder_decoder(
seq_prompt_len=block_size * num_prompt_blocks,
seq_output_lens=[
block_size * num_output_blocks_per_seq
for _ in range(num_seqs_per_group)
],
request_id="0")
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
    # Assert that can_allocate() fails with NotImplementedError due to the
    # combination of encoder/decoder and prefix caching
with pytest.raises(NotImplementedError) as exc_info:
block_manager.can_allocate(seq_group)
assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("prompt_len", [1, 7, 8])
@pytest.mark.parametrize("num_slots_to_append", [1, 8, 129])
@pytest.mark.parametrize("num_lookahead_slots", [0, 10])
def test_append_slots(block_size, prompt_len, num_slots_to_append,
num_lookahead_slots):
"""Verify append_slots consumes the correct number of blocks from the block
table.
"""
num_gpu_blocks = 1024
watermark = 0.1
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
watermark=watermark,
)
seq_group = create_seq_group(
seq_prompt_len=prompt_len,
seq_output_lens=[0],
)
# Allocate seq
assert block_manager.can_allocate(seq_group)
block_manager.allocate(seq_group)
    # Set seq to RUNNING
seq = seq_group.get_seqs()[0]
seq.status = SequenceStatus.RUNNING
    # Append tokens to the sequence
for token_id in range(num_slots_to_append):
seq.append_token_id(token_id, {token_id: Logprob(0.0)})
# Append slots for new tokens and lookahead slots.
free_blocks_before_append = block_manager.get_num_free_gpu_blocks()
block_manager.append_slots(seq, num_lookahead_slots)
num_consumed_blocks = (free_blocks_before_append -
block_manager.get_num_free_gpu_blocks())
# Expect consumed blocks to be new blocks required to support the new slots.
expected_consumed_blocks = len(
list(
chunk_list(
list(
range(prompt_len + num_slots_to_append +
num_lookahead_slots)),
block_size))) - len(
list(chunk_list(list(range(prompt_len)), block_size)))
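    # Equivalently (a restatement, not part of the original test): expected
    # consumption is cdiv(prompt_len + num_slots_to_append +
    # num_lookahead_slots, block_size) - cdiv(prompt_len, block_size).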
assert num_consumed_blocks == expected_consumed_blocks
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("num_cpu_blocks", [4])
@pytest.mark.parametrize("num_gpu_blocks", [4])
@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
@pytest.mark.parametrize("enable_caching", [False, True])
def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
enable_caching):
"""Verify blocks number on src/desc device is correct after swapping in/out
sequence group (not missing or extra blocks).
"""
block_manager = SelfAttnBlockSpaceManager(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0,
enable_caching=enable_caching)
prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
prompt.status = SequenceStatus.WAITING
block_manager.allocate(seq_group)
# Emulate a forward pass by appending a single token.
# The block manager then knows how many unprocessed
# tokens will be written in the next forward pass.
token_id = 0
prompt.status = SequenceStatus.RUNNING
prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
# Swap seq group from GPU -> CPU.
gpu_blocks = block_manager.get_block_table(prompt)
assert block_manager.can_swap_out(seq_group)
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
mapping = block_manager.swap_out(seq_group)
mapping_keys = [key for key, _ in mapping]
assert mapping_keys == gpu_blocks
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
prompt.status = SequenceStatus.SWAPPED
# Swap seq group from CPU -> GPU.
assert block_manager.can_swap_in(seq_group, num_lookahead_slots)
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
mapping = block_manager.swap_in(seq_group)
cpu_blocks = block_manager.get_block_table(prompt)
mapping_keys = [key for key, _ in mapping]
assert mapping_keys == [cpu_blocks[0]]
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("num_gpu_blocks", [4])
@pytest.mark.parametrize("num_lookahead_slots", [3, 8, 10])
@pytest.mark.parametrize("enable_caching", [True, False])
def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
enable_caching):
""" Verify the block manager can correctly determine if a sequence group
can be swapped in/out.
"""
num_cpu_blocks = num_gpu_blocks
block_manager = SelfAttnBlockSpaceManager(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0,
enable_caching=enable_caching)
prompt, seq_group = create_dummy_prompt(
"1", prompt_length=(num_gpu_blocks - 1) * block_size - 1)
prompt.status = SequenceStatus.WAITING
block_manager.allocate(seq_group)
prompt.status = SequenceStatus.RUNNING
# Swap seq group from GPU -> CPU.
gpu_blocks = block_manager.get_block_table(prompt)
assert block_manager.can_swap_out(seq_group)
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
mapping = block_manager.swap_out(seq_group)
mapping_keys = [key for key, _ in mapping]
assert mapping_keys == gpu_blocks
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
prompt.status = SequenceStatus.SWAPPED
# At this moment, we still have enough free blocks to swap in the seq group.
if num_lookahead_slots <= block_size:
assert block_manager.can_swap_in(seq_group,
num_lookahead_slots) == AllocStatus.OK
else:
assert block_manager.can_swap_in(
seq_group, num_lookahead_slots) == AllocStatus.NEVER
    # During swap-out, 2 cached blocks were evicted from the GPU,
    # so prompt1 can't be swapped in
prompt2_len = 2 * block_size - 1
prompt2, seq_group2 = create_dummy_prompt(
"2",
prompt_length=prompt2_len,
prompt_tokens=[10000 + i for i in range(prompt2_len)])
prompt2.status = SequenceStatus.WAITING
block_manager.allocate(seq_group2)
# Swap seq group from CPU -> GPU.
if num_lookahead_slots <= block_size:
assert block_manager.can_swap_in(
seq_group, num_lookahead_slots) == AllocStatus.LATER
else:
assert block_manager.can_swap_in(
seq_group, num_lookahead_slots) == AllocStatus.NEVER
@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
@pytest.mark.parametrize("enable_caching", [False, True])
def test_swap_in_infeasible(num_lookahead_slots, enable_caching):
"""Verifies that swapping fails if there is not enough free blocks
to account for unseen tokens and lookahead_slots.
"""
block_size = 8
num_cpu_blocks = 1
num_gpu_blocks = 1
block_manager = SelfAttnBlockSpaceManager(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0,
enable_caching=enable_caching)
prompt_length = block_size - 3
assert prompt_length > 0
prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length)
prompt.status = SequenceStatus.WAITING
block_manager.allocate(seq_group)
# Emulate a forward pass by appending a single token.
# The block manager then knows how many unprocessed
# tokens will be written in the next forward pass.
token_id = 0
prompt.status = SequenceStatus.RUNNING
prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
# Swap seq group from GPU -> CPU.
assert block_manager.can_swap_out(seq_group)
block_manager.swap_out(seq_group)
prompt.status = SequenceStatus.SWAPPED
# Swap seq group from CPU -> GPU.
    # The number of unseen tokens is 1. If the number of existing
    # tokens plus the unseen ones plus the number of lookahead slots
    # exceeds the total token capacity of the available GPU blocks,
    # then the swap should fail.
num_unseen_tokens = 1
if (num_lookahead_slots + num_unseen_tokens +
prompt_length) <= (block_size * num_gpu_blocks):
assert block_manager.can_swap_in(seq_group,
num_lookahead_slots) == AllocStatus.OK
else:
assert block_manager.can_swap_in(
seq_group, num_lookahead_slots) == AllocStatus.NEVER
# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
@pytest.mark.parametrize("block_size", [8, 16])
@pytest.mark.parametrize("prompt_len", [10, 300, 1000])
@pytest.mark.parametrize("num_slots_to_append", [50])
@pytest.mark.parametrize("sliding_window", [20, 32, 200, 512])
def test_sliding_window(block_size, prompt_len, num_slots_to_append,
sliding_window):
"""Verify append_slots consumes the correct number of blocks from the block
table.
"""
num_gpu_blocks = 1024
watermark = 0.1
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
watermark=watermark,
sliding_window=sliding_window,
)
def check_used(min_n, max_n=None):
if max_n is None:
max_n = min_n
used = num_gpu_blocks - block_manager.get_num_free_gpu_blocks()
assert min_n <= used
assert used <= max_n
def num_blocks(num_tokens):
return (num_tokens + block_size - 1) // block_size
check_used(0)
seq_group = create_seq_group(
seq_prompt_len=prompt_len,
seq_output_lens=[0],
)
check_used(0)
# Allocate seq
assert block_manager.can_allocate(seq_group)
block_manager.allocate(seq_group)
check_used(num_blocks(prompt_len))
    # Set seq to RUNNING
seq = seq_group.get_seqs()[0]
seq.status = SequenceStatus.RUNNING
seq.data.update_num_computed_tokens(prompt_len)
check_used(num_blocks(prompt_len))
# this is how we compute it in SelfAttnBlockSpaceManager.__init__
sliding_blocks = (sliding_window // block_size) + 2
# plus one block for null block
sliding_blocks += 1
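    # Worked example (illustrative): sliding_window=32, block_size=8 gives
    # 32 // 8 + 2 + 1 = 7 blocks: the window itself, headroom for partially
    # filled boundary blocks, and the shared null block.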
    # Append tokens to the sequence
for token_id in range(num_slots_to_append):
seq.append_token_id(token_id, {token_id: Logprob(0.0)})
seq.data.update_num_computed_tokens(1)
block_manager.append_slots(seq, num_lookahead_slots=0)
if prompt_len < sliding_window + 10:
check_used(0, sliding_blocks + 1)
else:
check_used(sliding_blocks, sliding_blocks + 1)

View File

@@ -0,0 +1,576 @@
from typing import List
import pytest
from vllm.core.block.block_table import BlockTable
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
from vllm.utils import Device, cdiv, chunk_list
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
def test_allocate_naive(block_size: int, sequence_len: int):
"""Test the allocation of blocks using the naive allocator.
This test creates a CpuGpuBlockAllocator with the specified block size and
number of blocks. It then allocates multiple BlockTables with varying
sequence lengths and verifies that the number of free blocks decreases as
expected after each allocation.
"""
assert block_size > 1
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type="naive",
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
block_size=block_size,
)
token_ids = list(range(sequence_len))
num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
block_tables: List[BlockTable] = []
for i in range(5):
assert allocator.get_num_free_blocks(
device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc
block_tables.append(
BlockTable(
block_size=block_size,
block_allocator=allocator,
))
block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
def test_allocate_prefix_caching(block_size: int, sequence_len: int):
"""Test the allocation of blocks using the prefix caching allocator.
This test creates a CpuGpuBlockAllocator with the specified block size and
number of blocks, using the prefix caching allocator. It then allocates
multiple BlockTables with varying sequence lengths and verifies that the
number of free blocks decreases as expected after each allocation.
The test expects all sequences to share allocations, except for their last
block, which may be mutable. It calculates the expected number of immutable
and mutable blocks per allocation based on the sequence length and block
size.
"""
assert block_size > 1
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type="prefix_caching",
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
block_size=block_size,
)
token_ids = list(range(sequence_len))
chunked_tokens = list(chunk_list(token_ids, block_size))
num_mutable_blocks_per_alloc = 0 if len(
chunked_tokens[-1]) == block_size else 1
num_immutable_blocks_per_alloc = len(
chunked_tokens) - num_mutable_blocks_per_alloc
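    # Worked example (illustrative): sequence_len=129, block_size=16 gives 9
    # chunks, the last holding a single token, so 8 immutable + 1 mutable
    # block; the immutable blocks are shared, and only the mutable block is
    # allocated anew for each table.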
block_tables: List[BlockTable] = []
for alloc_i in range(1, 6):
block_tables.append(
BlockTable(
block_size=block_size,
block_allocator=allocator,
))
block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
# Expect all sequences to share allocations, except for their last block
# (which may be mutable).
assert allocator.get_num_free_blocks(
device=Device.GPU) == num_gpu_blocks - (
num_immutable_blocks_per_alloc + num_mutable_blocks_per_alloc *
(alloc_i))
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
@pytest.mark.parametrize("device", ["cpu", "gpu"])
def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str,
device: str):
"""Test the allocation and freeing of blocks using different allocators and
devices.
This test creates a CpuGpuBlockAllocator with the specified block size,
number of blocks, allocator type, and device. It then allocates a BlockTable
multiple times with the same sequence and verifies that the number of free
blocks remains consistent after each allocation and freeing.
"""
device = Device[device.upper()]
num_device_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_device_blocks,
num_cpu_blocks=num_device_blocks,
block_size=block_size,
)
token_ids = list(range(sequence_len))
num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
for i in range(5):
block_table.allocate(token_ids=token_ids, device=device)
assert allocator.get_num_free_blocks(
device) == num_device_blocks - num_blocks_per_alloc
assert all(block_id is not None
for block_id in block_table.physical_block_ids)
block_table.free()
assert allocator.get_num_free_blocks(device) == num_device_blocks
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_append_token_ids_allocation(block_size: int, sequence_len: int,
append_len: int, allocator_type: str):
"""Test the allocation behavior when appending token IDs to a BlockTable.
This test creates a CpuGpuBlockAllocator with the specified block size,
number of blocks, and allocator type. It then allocates a BlockTable with an
initial sequence and appends additional token IDs to it. The test verifies
that the number of allocated blocks before and after appending matches the
expected values.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
block_size=block_size,
)
token_ids = list(range(sequence_len))
token_ids_to_append = list(range(append_len))
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
num_expected_blocks_before_append = len(
list(chunk_list(token_ids, block_size)))
num_expected_appended_blocks = len(
list(chunk_list(token_ids + token_ids_to_append,
block_size))) - num_expected_blocks_before_append
block_table.allocate(token_ids=token_ids, device=Device.GPU)
assert len(
block_table.physical_block_ids) == num_expected_blocks_before_append
block_table.append_token_ids(token_ids_to_append)
assert len(
block_table.physical_block_ids
) == num_expected_blocks_before_append + num_expected_appended_blocks
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("num_empty_slots", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int,
num_empty_slots: int,
allocator_type: str):
"""Test the allocation behavior when ensuring a certain number of empty
slots in a BlockTable.
This test creates a CpuGpuBlockAllocator with the specified block size,
number of blocks, and allocator type. It then allocates a BlockTable with an
initial sequence and ensures a certain number of empty slots. The test
verifies that the number of allocated blocks before and after ensuring empty
slots matches the expected values. It also checks that filling up the empty
slots does not consume additional blocks.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
block_size=block_size,
)
token_ids = list(range(sequence_len))
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
num_expected_blocks_before_append = len(
list(chunk_list(token_ids, block_size)))
num_expected_appended_blocks = len(
list(chunk_list(token_ids + [-1] * num_empty_slots,
block_size))) - num_expected_blocks_before_append
block_table.allocate(token_ids=token_ids, device=Device.GPU)
# Assert that the empty slots consume the expected number of additional
# blocks.
assert len(
block_table.physical_block_ids) == num_expected_blocks_before_append
block_table.ensure_num_empty_slots(num_empty_slots)
assert len(
block_table.physical_block_ids
) == num_expected_blocks_before_append + num_expected_appended_blocks
# Now, ensure no additional blocks consumed as we fill up the empty slots.
num_free_blocks = allocator.get_num_free_blocks(device=Device.GPU)
block_table.append_token_ids(token_ids=list(range(num_empty_slots)))
assert num_free_blocks == allocator.get_num_free_blocks(device=Device.GPU)
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 9])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("append_size", [1, 4, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
append_len: int, allocator_type: str,
append_size: int):
"""Verify token ids are correctly appended. Appends various amounts of
token ids in various append sizes, and verifies the final sequence is
correct.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
block_size=block_size,
)
token_ids = list(range(sequence_len))
token_ids_to_append = list(range(append_len))
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
block_table.allocate(token_ids=token_ids, device=Device.GPU)
appended_so_far: List[int] = []
for append in chunk_list(token_ids_to_append, append_size):
block_table.append_token_ids(append)
appended_so_far.extend(append)
assert block_table._get_all_token_ids() == token_ids + appended_so_far
assert block_table._get_all_token_ids() == token_ids + token_ids_to_append
@pytest.mark.parametrize("seq_len", [1, 9, 129])
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_fork(seq_len: int, block_size: int, allocator_type: str):
"""Create a sequence using the specified allocator.
1. Assert that after forking the sequence, the free block count is the
same.
2. Assert that the forked sequence has the same physical mappings.
3. Then free the original sequence; verify that the free block count is
the same.
    4. Finally, free the forked sequence and verify that all blocks are
       returned to the allocator (the free count equals the total).
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
block_size=block_size,
)
token_ids = list(range(seq_len))
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
block_table.allocate(token_ids)
num_free_blocks_before_fork = allocator.get_num_free_blocks(
device=Device.GPU)
forked_block_table = block_table.fork()
# Expect physical_block_ids and token_ids to match.
assert (block_table.physical_block_ids ==
forked_block_table.physical_block_ids)
assert block_table._get_all_token_ids(
) == forked_block_table._get_all_token_ids()
# Do not expect any additional allocations.
assert allocator.get_num_free_blocks(
device=Device.GPU) == num_free_blocks_before_fork
# Free the original blocks. Assert num free blocks does not change, since
# refcount is nonzero.
block_table.free()
assert allocator.get_num_free_blocks(
device=Device.GPU) == num_free_blocks_before_fork
# Expect the forked block table to be unaffected by the free.
assert all(block_id is not None
for block_id in forked_block_table.physical_block_ids)
# Free the forked blocks. Assert num free blocks does change, since
# refcount is now zero.
forked_block_table.free()
assert allocator.get_num_free_blocks(device=Device.GPU) == num_gpu_blocks
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("appender", ["forked", "original"])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_cow(block_size: int, sequence_len: int, append_len: int,
allocator_type: str, appender: str):
"""Fork a sequence; append to the forked sequence; verify there's a CoW.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
block_size=block_size,
)
token_ids = list(range(sequence_len))
token_ids_to_append = list(range(append_len))
original_block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
num_expected_non_cow_blocks = cdiv(sequence_len, block_size)
num_expected_cow_blocks = cdiv(sequence_len + append_len,
block_size) - (sequence_len // block_size)
original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
original_block_ids = original_block_table.physical_block_ids[:]
print("original_block_ids = {}".format(original_block_ids))
forked_block_table = original_block_table.fork()
# Expect no additional allocation (copy on _write_).
assert allocator.get_num_free_blocks(
Device.GPU) == (num_gpu_blocks - num_expected_non_cow_blocks)
if appender == "forked":
appender_block_table = forked_block_table
static_block_table = original_block_table
elif appender == "original":
appender_block_table = original_block_table
static_block_table = forked_block_table
else:
raise ValueError(f"unknown test config {appender=}")
# Write tokens.
appender_block_table.append_token_ids(token_ids_to_append)
# Expect the non-appending block table to have no change.
assert static_block_table.physical_block_ids == original_block_ids
assert appender_block_table.physical_block_ids != original_block_ids
# Expect the blocks changed during append to have a CoW.
assert allocator.get_num_free_blocks(
Device.GPU) == num_gpu_blocks - (num_expected_non_cow_blocks +
num_expected_cow_blocks)
cows = allocator.clear_copy_on_writes()
if sequence_len % block_size > 0:
# If the last block in the sequence is not full, then when appending we
# expect a CoW.
assert cows
cow_block_id = sequence_len // block_size
expected_src = static_block_table.physical_block_ids[cow_block_id]
expected_dst = appender_block_table.physical_block_ids[cow_block_id]
assert (expected_src, expected_dst) in cows
else:
# Otherwise, there should be no copy-on-write.
assert not cows
static_block_table.free()
appender_block_table.free()
# After free, expect all blocks to be freed.
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
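
# Illustrative sketch (not part of the original change): the copy-on-write
# rule the test above relies on, in miniature. A block shared by more than
# one sequence (refcount > 1) must be copied before an in-place write; a
# block with refcount == 1 may be mutated directly.
def test_cow_rule_sketch():
    def needs_copy_on_write(refcount: int) -> bool:
        return refcount > 1

    assert not needs_copy_on_write(1)   # sole owner: write in place
    assert needs_copy_on_write(2)       # shared after a fork: copy first
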
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("lookahead_slots", [1, 16, 129])
@pytest.mark.parametrize("appender", ["forked", "original"])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_cow_lookahead_simple(block_size: int, sequence_len: int,
append_len: int, lookahead_slots: int,
allocator_type: str, appender: str):
"""Similar to test_cow, except with lookahead allocation. The assertions are
less rigorous due to the complexity of the property under test.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
block_size=block_size,
)
token_ids = list(range(sequence_len))
token_ids_to_append = list(range(append_len))
original_block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
# Allocate lookahead slots.
original_block_table.ensure_num_empty_slots(lookahead_slots)
original_block_ids = original_block_table.physical_block_ids[:]
forked_block_table = original_block_table.fork()
if appender == "forked":
appender_block_table = forked_block_table
static_block_table = original_block_table
elif appender == "original":
appender_block_table = original_block_table
static_block_table = forked_block_table
else:
raise ValueError(f"unknown test config {appender=}")
# Write tokens.
appender_block_table.append_token_ids(token_ids_to_append)
# Expect the non-appending block table to have no change.
assert static_block_table.physical_block_ids == original_block_ids
assert appender_block_table.physical_block_ids != original_block_ids
cows = allocator.clear_copy_on_writes()
# Always expect copy-on-write
assert cows
if sequence_len % block_size > 0:
# If the last block in the sequence is not full, then when appending we
# expect a CoW.
assert cows
cow_block_id = sequence_len // block_size
expected_src = static_block_table.physical_block_ids[cow_block_id]
expected_dst = appender_block_table.physical_block_ids[cow_block_id]
assert (expected_src, expected_dst) in cows
static_block_table.free()
appender_block_table.free()
# After free, expect all blocks to be freed.
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("num_new_tokens", [1, 16, 129])
@pytest.mark.parametrize("num_lookahead_slots", [1, 7, 8])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int,
num_new_tokens: int,
num_lookahead_slots: int,
allocator_type: str):
"""Verify correct calculation of get_num_blocks_touched_by_append_slots.
This is done by using copy-on-write, which requires any modified block to
be copied before write if the refcount > 1. We set the refcount>1 by forking
a sequence, then measure the free blocks before and after an append. If the
    number of consumed blocks equals what
    `get_num_blocks_touched_by_append_slots` returns, then the calculation is
    correct.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
block_size=block_size,
)
token_ids = list(range(sequence_len))
token_ids_to_append = list(range(num_new_tokens))
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
block_table.allocate(token_ids=token_ids, device=Device.GPU)
# Add lookahead before fork so both sequences have the same lookahead
# blocks.
block_table.ensure_num_empty_slots(num_empty_slots=num_lookahead_slots)
# Fork sequence so that every block has refcount > 1.
_ = block_table.fork()
# Determine how many blocks should be touched.
expected_num_touched_blocks = (
block_table.get_num_blocks_touched_by_append_slots(
token_ids=token_ids_to_append,
num_lookahead_slots=num_lookahead_slots))
# Measure how many blocks are touched by measuring num_free_blocks before
# and after the append.
#
# We expect append_token_ids to CoW all mutated blocks that have refcount>1.
num_free_blocks_before_append = allocator.get_num_free_blocks(Device.GPU)
block_table.append_token_ids(token_ids_to_append, num_lookahead_slots)
num_consumed_blocks = (num_free_blocks_before_append -
allocator.get_num_free_blocks(Device.GPU))
# TODO(cade) ensure equality when num_lookahead_slots > 0.
# The reason we have < is because lookahead blocks are not copied eagerly;
# they are copied on first write. This will cause issues for beam search +
# speculative decoding. This is acceptable for now as it is a large effort
# to combine the two. To fix this, we can ensure single sequence ownership
# of lookahead blocks by appending empty slots to each block, which will
# trigger the CoW.
#
# Until then, we can accept that the consumed tokens are <= the expected
# tokens when appending with lookahead.
if num_lookahead_slots > 0:
assert num_consumed_blocks <= expected_num_touched_blocks
else:
assert num_consumed_blocks == expected_num_touched_blocks
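
# Illustrative sketch (not part of the original change): the touched-block
# count the test above expects, assuming every block overlapping the
# appended region (including a partially filled tail block) is touched.
def test_touched_blocks_formula_sketch():
    import math

    def touched(seq_len: int, new_tokens: int, block_size: int) -> int:
        return (math.ceil((seq_len + new_tokens) / block_size) -
                seq_len // block_size)

    assert touched(seq_len=9, new_tokens=1, block_size=8) == 1  # tail only
    assert touched(seq_len=8, new_tokens=1, block_size=8) == 1  # fresh block
    assert touched(seq_len=9, new_tokens=8, block_size=8) == 2  # tail + new
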

View File

@@ -0,0 +1,42 @@
import random
import pytest
from vllm.core.block.common import RefCounter
@pytest.mark.parametrize("seed", list(range(20)))
@pytest.mark.parametrize("num_incrs", [1, 100])
@pytest.mark.parametrize("num_blocks", [1024])
def test_incr(seed: int, num_incrs: int, num_blocks: int):
random.seed(seed)
all_block_indices = list(range(num_blocks))
counter = RefCounter(all_block_indices=all_block_indices)
block_id = random.randint(0, num_blocks - 1)
for i in range(num_incrs):
value = counter.incr(block_id)
assert value == i + 1
@pytest.mark.parametrize("seed", list(range(20)))
@pytest.mark.parametrize("num_incrs", [1, 100])
@pytest.mark.parametrize("num_blocks", [1024])
def test_incr_decr(seed: int, num_incrs: int, num_blocks: int):
random.seed(seed)
all_block_indices = list(range(num_blocks))
counter = RefCounter(all_block_indices=all_block_indices)
block_id = random.randint(0, num_blocks - 1)
for i in range(num_incrs):
value = counter.incr(block_id)
assert value == i + 1
for i in range(num_incrs):
value = counter.decr(block_id)
assert value == num_incrs - (i + 1)
with pytest.raises(AssertionError):
counter.decr(block_id)
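
# Illustrative sketch (not part of the original change): a minimal counter
# with the same incr/decr contract the tests above exercise. The names below
# are illustrative, not vLLM's API.
def test_minimal_refcounter_sketch():
    from collections import defaultdict

    class MiniRefCounter:

        def __init__(self):
            self._counts = defaultdict(int)

        def incr(self, block_id: int) -> int:
            self._counts[block_id] += 1
            return self._counts[block_id]

        def decr(self, block_id: int) -> int:
            # Decrementing a zero refcount is a programming error.
            assert self._counts[block_id] > 0
            self._counts[block_id] -= 1
            return self._counts[block_id]

    counter = MiniRefCounter()
    assert counter.incr(0) == 1
    assert counter.incr(0) == 2   # e.g., a fork takes a second reference
    assert counter.decr(0) == 1
    assert counter.decr(0) == 0   # last reference released
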

View File

@@ -0,0 +1,93 @@
import pytest
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
from vllm.utils import Device, chunk_list
@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
@pytest.mark.parametrize("num_gpu_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
block_size: int, allocator_type: str):
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
block_size=block_size,
)
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
cpu_blocks = [
allocator.allocate_mutable_block(prev_block=None, device=Device.CPU)
for _ in range(num_cpu_blocks)
]
assert allocator.get_num_free_blocks(Device.CPU) == 0
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
gpu_blocks = [
allocator.allocate_mutable_block(prev_block=None, device=Device.GPU)
for _ in range(num_gpu_blocks)
]
assert allocator.get_num_free_blocks(Device.CPU) == 0
assert allocator.get_num_free_blocks(Device.GPU) == 0
_ = [allocator.free(block) for block in cpu_blocks]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == 0
_ = [allocator.free(block) for block in gpu_blocks]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
@pytest.mark.parametrize("num_gpu_blocks", [1024])
@pytest.mark.parametrize("block_size", [2])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
block_size: int, allocator_type: str):
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
block_size=block_size,
)
unique_token_ids = list(
range((num_cpu_blocks + num_gpu_blocks) * block_size))
gpu_token_ids = list(
chunk_list(unique_token_ids[:num_gpu_blocks * block_size], block_size))
cpu_token_ids = list(
chunk_list(unique_token_ids[num_gpu_blocks * block_size:], block_size))
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
cpu_blocks = [
allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids,
device=Device.CPU)
for token_ids in cpu_token_ids
]
assert allocator.get_num_free_blocks(Device.CPU) == 0
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
gpu_blocks = [
allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids,
device=Device.GPU)
for token_ids in gpu_token_ids
]
assert allocator.get_num_free_blocks(Device.CPU) == 0
assert allocator.get_num_free_blocks(Device.GPU) == 0
_ = [allocator.free(block) for block in cpu_blocks]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == 0
_ = [allocator.free(block) for block in gpu_blocks]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
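
# Illustrative sketch (not part of the original change): the invariant both
# tests above check, in miniature: the CPU and GPU pools are disjoint, so
# allocating or freeing on one device never changes the other's free count.
def test_disjoint_pools_sketch():
    pools = {"cpu": list(range(2)), "gpu": list(range(2, 6))}

    block = pools["cpu"].pop()          # allocate one CPU block
    assert len(pools["gpu"]) == 4       # GPU pool is unaffected
    pools["cpu"].append(block)          # free it again
    assert len(pools["cpu"]) == 2
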

View File

@@ -0,0 +1,145 @@
from typing import List, Optional
import pytest
from vllm.core.block.interfaces import Block, BlockAllocator
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
class TestNaiveBlockAllocator:
@staticmethod
def create_allocate_lambda(allocate_type: str,
allocator: NaiveBlockAllocator,
prev_block: Optional[Block],
token_ids: List[int]):
if allocate_type == "immutable":
allocate_block = lambda: allocator.allocate_immutable_block(
prev_block=prev_block, token_ids=token_ids)
elif allocate_type == "mutable":
allocate_block = lambda: allocator.allocate_mutable_block(
prev_block=prev_block)
else:
            raise ValueError(f"Unknown allocate_type: {allocate_type}")
return allocate_block
@staticmethod
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_allocate_ooms(allocate_type: str, num_blocks: int,
block_size: int):
allocator = NaiveBlockAllocator(create_block=NaiveBlock,
num_blocks=num_blocks,
block_size=block_size)
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
allocate_type,
allocator,
prev_block=None,
token_ids=list(range(block_size)))
[allocate_block() for _ in range(num_blocks)]
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocate_block()
@staticmethod
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_free_prevents_oom(allocate_type: str, num_blocks: int,
block_size: int):
allocator = NaiveBlockAllocator(create_block=NaiveBlock,
num_blocks=num_blocks,
block_size=block_size)
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
allocate_type,
allocator,
prev_block=None,
token_ids=list(range(block_size)))
blocks = [allocate_block() for _ in range(num_blocks)]
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocate_block()
block_to_free = blocks.pop()
for _ in range(100):
block_id = block_to_free.block_id
allocator.free(block_to_free)
assert block_to_free.block_id is None
new_block = allocate_block()
assert new_block.block_id == block_id
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocate_block()
block_to_free = new_block
@staticmethod
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
def test_get_num_free_blocks(allocate_type: str, num_blocks: int,
block_size: int):
allocator = NaiveBlockAllocator(create_block=NaiveBlock,
num_blocks=num_blocks,
block_size=block_size)
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
allocate_type,
allocator,
prev_block=None,
token_ids=list(range(block_size)))
assert allocator.get_num_free_blocks() == num_blocks
blocks = [allocate_block() for _ in range(num_blocks)]
for i, block in enumerate(blocks):
assert allocator.get_num_free_blocks() == i
allocator.free(block)
@staticmethod
@pytest.mark.parametrize("num_blocks", [4])
@pytest.mark.parametrize("block_size", [8])
def test_naive_block_get_num_full_blocks_touched(num_blocks, block_size):
""" Verify the allocator can correctly return the number of
full blocks touched.
"""
allocator_src = NaiveBlockAllocator(create_block=NaiveBlock,
num_blocks=num_blocks,
block_size=block_size)
allocator_dst = NaiveBlockAllocator(create_block=NaiveBlock,
num_blocks=num_blocks,
block_size=block_size)
        # Create a chain of blocks in the src
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
"immutable",
allocator_src,
prev_block=None,
token_ids=list(range(block_size)))
src_blocks = [allocate_block() for _ in range(num_blocks - 1)]
        # All src blocks are full, so all of them count as touched
assert allocator_dst.get_num_full_blocks_touched(
src_blocks) == num_blocks - 1
# Insert one non-full block in the src
        allocate_non_full_block = \
            TestNaiveBlockAllocator.create_allocate_lambda(
                "mutable", allocator_src,
                prev_block=src_blocks[-1], token_ids=[])
src_blocks.append(allocate_non_full_block())
src_blocks[-1].append_token_ids([0])
assert allocator_dst.get_num_full_blocks_touched(
src_blocks) == num_blocks - 1
        # Fill up the last source block and then invoke
        # get_num_full_blocks_touched
src_blocks[-1].append_token_ids([0] * (block_size - 1))
assert allocator_dst.get_num_full_blocks_touched(
src_blocks) == num_blocks

View File

@@ -0,0 +1,764 @@
import math
import random
from typing import List, Optional
from unittest.mock import MagicMock
import pytest
from vllm.core.block.interfaces import Block, BlockAllocator
from vllm.core.block.prefix_caching_block import (PrefixCachingBlock,
PrefixCachingBlockAllocator)
class TestPrefixCachingBlock:
@staticmethod
@pytest.mark.parametrize("seed", list(range(10)))
@pytest.mark.parametrize("block_size", [1, 16])
@pytest.mark.parametrize("is_curr_block_full", [True, False])
def test_first_block_has_correct_content_hash(seed: int, block_size: int,
is_curr_block_full: bool):
"""Verify a block which is first in the sequence has the correct hash.
"""
random.seed(seed)
num_to_fill = block_size if is_curr_block_full else random.randint(
0, block_size - 1)
token_ids = list(range(num_to_fill))
mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
block_with_prev = PrefixCachingBlock(prev_block=None,
token_ids=token_ids,
block_size=block_size,
allocator=mock_allocator)
if is_curr_block_full:
# Expect hash since block is full.
assert block_with_prev.content_hash == (
PrefixCachingBlock.hash_block_tokens(
is_first_block=True,
prev_block_hash=None,
cur_block_token_ids=token_ids))
else:
# Do not expect hash since block is not full.
assert block_with_prev.content_hash is None
@staticmethod
@pytest.mark.parametrize("seed", list(range(10)))
@pytest.mark.parametrize("block_size", [1, 16])
@pytest.mark.parametrize("is_curr_block_full", [True, False])
@pytest.mark.parametrize("prev_block_has_hash", [True, False])
def test_nth_block_has_correct_content_hash(seed: int, block_size: int,
is_curr_block_full: bool,
prev_block_has_hash: bool):
"""Verify a block which is not first in the sequence has the correct
hash.
"""
random.seed(seed)
previous_block = MagicMock(spec=PrefixCachingBlock)
prev_block_hash = random.randint(0, 1000)
previous_block.content_hash = (prev_block_hash
if prev_block_has_hash else None)
num_to_fill = block_size if is_curr_block_full else random.randint(
0, block_size - 1)
token_ids = list(range(num_to_fill))
mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
block_with_prev = PrefixCachingBlock(
prev_block=previous_block,
token_ids=token_ids,
block_size=block_size,
allocator=mock_allocator,
)
if is_curr_block_full and prev_block_has_hash:
# Expect hash since block is full and previous block has hash.
assert (block_with_prev.content_hash ==
PrefixCachingBlock.hash_block_tokens(
is_first_block=False,
prev_block_hash=prev_block_hash,
cur_block_token_ids=token_ids))
else:
# Do not expect hash since block is not full or the previous block
# does not have a hash.
assert block_with_prev.content_hash is None
@staticmethod
@pytest.mark.parametrize("block_size", [1, 2, 16])
@pytest.mark.parametrize("num_tokens", list(range(3)))
@pytest.mark.parametrize("num_empty_trailing_blocks", [0, 1, 10])
def test_blocks_have_correct_hash_in_chain(block_size: int,
num_tokens: int,
num_empty_trailing_blocks: int):
"""Create two chains of logical blocks with the same contents.
Assert the hashes are equal.
"""
random.seed(0)
token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]
first_chain, second_chain = (TestPrefixCachingBlock.create_chain(
block_size=block_size,
token_ids=token_ids,
num_empty_trailing_blocks=num_empty_trailing_blocks)
for _ in range(2))
for first_chain_block, second_chain_block in zip(
first_chain, second_chain):
assert (first_chain_block.content_hash ==
second_chain_block.content_hash)
if not first_chain or not second_chain:
assert first_chain == second_chain
assert num_tokens == 0
@staticmethod
def create_chain(block_size: int,
token_ids: List[int],
num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]:
"""Helper method which creates a chain of blocks.
"""
blocks: List[PrefixCachingBlock] = []
num_blocks = math.ceil(
len(token_ids) / block_size) + num_empty_trailing_blocks
if num_blocks == 0:
return []
allocator = MagicMock(spec=PrefixCachingBlockAllocator)
prev_block = None
for block_number in range(0, num_blocks):
prev_block = PrefixCachingBlock(
prev_block=prev_block,
token_ids=[],
block_size=block_size,
allocator=allocator,
)
tokens_to_append = token_ids[block_number *
block_size:(block_number + 1) *
block_size]
if tokens_to_append:
prev_block.append_token_ids(tokens_to_append)
blocks.append(prev_block)
return blocks
class TestPrefixCachingBlockAllocator:
@staticmethod
def create_allocate_lambda(allocate_type: str, allocator: BlockAllocator,
prev_block: Optional[Block],
token_ids: List[int]):
if allocate_type == "immutable":
allocate_block = lambda: allocator.allocate_immutable_block(
prev_block=prev_block, token_ids=token_ids)
elif allocate_type == "mutable":
allocate_block = lambda: allocator.allocate_mutable_block(
prev_block=prev_block)
else:
            raise ValueError(f"Unknown allocate_type: {allocate_type}")
return allocate_block
@staticmethod
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_allocate_mutable_ooms(num_blocks: int, block_size: int):
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda(
allocate_type="mutable",
allocator=allocator,
prev_block=None,
token_ids=list(range(block_size)),
)
[allocate_block() for _ in range(num_blocks)]
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocate_block()
@staticmethod
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_allocate_immutable_does_not_oom_single_hash(
num_blocks: int, block_size: int):
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda(
allocate_type="immutable",
allocator=allocator,
prev_block=None,
token_ids=list(range(block_size)),
)
blocks = [allocate_block() for _ in range(num_blocks)]
# Expect no OOM. If these were mutable blocks, this would OOM.
non_oom_block = allocate_block()
# Expect all blocks to have same physical block index.
for block in blocks:
assert (block.block_id == non_oom_block.block_id)
@staticmethod
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_allocate_immutable_ooms_many_hash(num_blocks: int,
block_size: int):
"""Consume all blocks using many different hashes/block content.
Do this by creating a sequence that is very long.
Expect next block to OOM.
"""
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
# Create token ids that will exhaust all blocks.
token_ids = list(range(num_blocks * block_size))
chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids,
allocator=allocator,
)
# Expect allocation with unseen hash to fail.
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocator.allocate_immutable_block(prev_block=chain[-1],
token_ids=list(
range(block_size)))
# Expect mutable allocation to fail.
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocator.allocate_mutable_block(prev_block=chain[-1])
# Expect allocation of exact same chain to pass.
second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids,
allocator=allocator,
)
# Expect physical block indices to be the same in both chains.
assert chain and second_chain
for first_chain_block, second_chain_block in zip(chain, second_chain):
assert (first_chain_block.block_id == second_chain_block.block_id)
@staticmethod
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_free_prevents_oom(num_blocks: int, block_size: int):
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
# Create token ids that will exhaust all blocks.
token_ids = list(range(num_blocks * block_size))
chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids,
allocator=allocator,
)
# Expect mutable allocation to fail.
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocator.allocate_mutable_block(prev_block=None)
block_to_free = chain[-1]
# Expect free/allocate loop to succeed many times.
for i in range(100):
block_id = block_to_free.block_id
allocator.free(block_to_free)
assert block_to_free.block_id is None, i
new_block = allocator.allocate_mutable_block(prev_block=None)
assert new_block.block_id == block_id, i
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocator.allocate_mutable_block(prev_block=None)
block_to_free = new_block
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(20)))
def test_get_num_free_blocks(num_blocks: int, block_size: int, seed: int):
random.seed(seed)
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
num_blocks_to_consume = random.randint(1, num_blocks - 1)
# Create token ids that will exhaust all blocks.
token_ids = list(range(num_blocks_to_consume * block_size))
chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids,
allocator=allocator,
)
# Free each block in chain, assert num free blocks includes new free
# block.
for i, block in enumerate(chain):
assert allocator.get_num_free_blocks() == (num_blocks -
num_blocks_to_consume +
i)
allocator.free(block)
@staticmethod
@pytest.mark.parametrize("num_blocks", [4])
@pytest.mark.parametrize("block_size", [8])
def test_prefix_caching_block_get_num_full_blocks_touched(
num_blocks, block_size):
""" Verify the allocator can correctly return the number of
blocks touched, when there are cached prefixes.
"""
allocator_src = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
allocator_dst = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
# Create token ids that will exhaust all blocks except the last
token_ids = list(range((num_blocks - 1) * block_size))
# Create a chain of cacheable blocks in the dst
cached_blocks = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids,
allocator=allocator_dst,
)
# Create a chain of the same blocks in the src
blocks_to_swap_in = \
TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids,
allocator=allocator_src,
)
# All blocks are cached
assert allocator_dst.get_num_full_blocks_touched(
blocks_to_swap_in) == 0
# Free the first block in the dst
allocator_dst.free(cached_blocks[0])
        # Now the first block is dangling; the swapped-in blocks need to
        # reclaim it in the dst
assert allocator_dst.get_num_full_blocks_touched(
blocks_to_swap_in) == 1
# Insert one non-full block in the src
non_full_block = allocator_src.allocate_mutable_block(
blocks_to_swap_in[-1])
non_full_block.append_token_ids([0])
blocks_to_swap_in.append(non_full_block)
assert allocator_dst.get_num_full_blocks_touched(
blocks_to_swap_in) == 1
        # Fill up the last mutable block and invoke
        # get_num_full_blocks_touched.
        # Note: the last block is not cached, so it will be touched.
non_full_block.append_token_ids([0] * (block_size - 1))
assert allocator_dst.get_num_full_blocks_touched(
blocks_to_swap_in) == 2
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(20)))
def test_get_num_free_blocks_shared(num_blocks: int, block_size: int,
seed: int):
"""Verify sharing occurs by allocating two sequences that share prefixes
and incrementally freeing blocks.
"""
random.seed(seed)
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
num_blocks_to_consume = random.randint(1, num_blocks - 1)
# Create token ids that will exhaust all blocks.
token_ids = list(range(num_blocks_to_consume * block_size))
first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids,
allocator=allocator,
)
second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids,
allocator=allocator,
)
# Free each block in the first chain. Since all blocks are shared, the
# free count should stay constant.
for i, block in enumerate(first_chain):
assert allocator.get_num_free_blocks() == (num_blocks -
num_blocks_to_consume)
allocator.free(block)
# Free each block in the second chain. Since the refcount is now zero,
# the free count should increment with each free.
for i, block in enumerate(second_chain):
assert allocator.get_num_free_blocks() == (num_blocks -
num_blocks_to_consume +
i)
allocator.free(block)
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(20)))
def test_get_common_computed_block_ids(num_blocks: int, block_size: int,
seed: int):
"""Verify get_common_computed_block_ids could get correct result
by create two immutable chain sharing prefix at specified pos,
and compare whether we also could get right result
from get_common_computed_block_ids.
"""
random.seed(seed)
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks * 2,
block_size=block_size)
num_blocks_to_consume = random.randint(1, num_blocks - 1)
# Create token ids that will exhaust all blocks.
token_ids = list(range(num_blocks_to_consume * block_size))
first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids,
allocator=allocator,
)
        # After zero_point, the second chain's token ids are set to -1,
        # which makes them differ from the first chain beyond that point.
zero_point = random.randint(1, len(token_ids) - 1)
zero_point_blocks = zero_point // block_size
token_ids[zero_point:] = [-1] * (len(token_ids) - zero_point)
second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids,
allocator=allocator,
)
first_computed_ids = [
first_chain[i].block_id for i in range(num_blocks_to_consume)
]
second_computed_ids = [
second_chain[i].block_id for i in range(num_blocks_to_consume)
]
res = allocator.get_common_computed_block_ids(
[first_computed_ids, second_computed_ids])
assert (len(res) == zero_point_blocks)
    # Test case asserting that a mutable block promoted to match an existing
    # immutable block is released back into the hashless allocator, while the
    # first immutable block's refcount is increased.
@staticmethod
@pytest.mark.parametrize("num_blocks", [3])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(10)))
def test_alloc_promotion(num_blocks: int, block_size: int, seed: int):
random.seed(seed)
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
token_ids = list(range(block_size))
block = allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids)
assert allocator._refcounter.get(block.block_id) == 1
m = allocator.allocate_mutable_block(prev_block=None)
block_id = m.block_id
for i in range(block_size):
m.append_token_ids([i])
        # After the mutable block is promoted to immutable, if a block with
        # the same content hash already exists, the promoted block shall be
        # released into the hashless allocator and the first immutable
        # block's refcount shall be increased by 1.
assert m.block_id == block.block_id
assert block_id in allocator._hashless_allocator._free_block_indices
assert allocator._refcounter.get(block.block_id) == 2
    # Test case mixing eviction and allocation to make sure they work as
    # expected.
@staticmethod
@pytest.mark.parametrize("num_blocks", [3])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(10)))
def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int):
random.seed(seed)
all_blocks_list = [i for i in range(num_blocks)]
zero_ref = {i: 0 for i in range(num_blocks)}
one_ref = {i: 1 for i in range(num_blocks)}
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
token_ids = list(range(num_blocks * block_size))
# Verify initial/pre-alloc state
# Ensure all blocks are free inside hashless allocator
assert list(allocator._hashless_allocator._free_block_indices
) == all_blocks_list
        # Ensure all blocks are tracked but none are active
assert len(allocator._block_tracker.keys()) == num_blocks
for block_id in range(num_blocks):
assert not allocator._block_tracker[block_id].active
# Ensure no cached blocks
assert len(allocator._cached_blocks.values()) == 0
# Ensure no evicted blocks
assert len(allocator.evictor.free_table.keys()) == 0
        # Ensure zero refcounts for all blocks
assert allocator._refcounter._refcounts == zero_ref
        # Allocate immutable chains, each consisting of a single block
new_block = []
for i in range(num_blocks):
block = allocator.allocate_immutable_block(
prev_block=None,
token_ids=token_ids[block_size * i:block_size * (i + 1)])
new_block.append(block)
# Verify post-alloc state
# Ensure no blocks are free inside hashless allocator
assert (len(allocator._hashless_allocator._free_block_indices) == 0)
# Ensure all blocks are tracked
assert len(allocator._block_tracker.keys()) == num_blocks
for block_id in range(num_blocks):
assert allocator._block_tracker[block_id].active
# Ensure all blocks are cached (all promoted)
assert len(allocator._cached_blocks.values()) == num_blocks
# Ensure no evicted blocks
assert len(allocator.evictor.free_table.keys()) == 0
        # Ensure a refcount of one for every block
assert allocator._refcounter._refcounts == one_ref
        # Free all blocks; now all blocks shall be in the evictor,
        # no block shall be active in _block_tracker,
        # all blocks shall remain tracked in _cached_blocks,
        # and every block's refcount shall be zero.
for block in new_block:
allocator.free(block)
# Verify post-free state
        # Ensure all blocks are tracked but none are active
assert len(allocator._block_tracker.keys()) == num_blocks
for block_id in range(num_blocks):
assert not allocator._block_tracker[block_id].active
# Ensure no blocks in hashless allocator (all promoted)
assert len(allocator._hashless_allocator._free_block_indices) == 0
# Ensure all blocks are cached
assert list(allocator._cached_blocks.values()) == all_blocks_list
# Ensure all blocks are inside the evictor
assert list(allocator.evictor.free_table.keys()) == all_blocks_list
        # Ensure zero refcounts
assert allocator._refcounter._refcounts == zero_ref
        # Allocate a mutable block; the first block shall be evicted,
        # its content hash reset to None, and its refcount set to 1.
mutable = allocator.allocate_mutable_block(prev_block=None)
assert mutable.block_id == 0
assert mutable.content_hash is None
assert allocator._block_tracker[0].active
assert allocator._refcounter.get(0) == 1
assert 0 not in allocator._cached_blocks
assert 0 not in allocator.evictor
        # Since this mutable block has no hash yet, it shall be released
        # into the hashless allocator
allocator.free(mutable)
assert not allocator._block_tracker[0].active
assert allocator._refcounter._refcounts == zero_ref
assert 0 not in allocator._cached_blocks
assert 0 not in allocator.evictor
assert 0 in allocator._hashless_allocator._free_block_indices
        # When allocating an immutable block with the first block_size
        # tokens, we shall get the free block from the hashless allocator,
        # leaving no blocks there
block = allocator.allocate_immutable_block(
prev_block=None, token_ids=token_ids[:block_size])
assert block.block_id == 0
assert len(allocator._hashless_allocator._free_block_indices) == 0
assert allocator._block_tracker[0].active
assert 0 in allocator._cached_blocks.values()
assert allocator._refcounter.get(0) == 1
assert 0 not in allocator.evictor
        # Allocate a mutable block again; it shall be popped from the evictor
mutable = allocator.allocate_mutable_block(prev_block=None)
assert len(allocator._hashless_allocator._free_block_indices) == 0
assert mutable.block_id not in allocator.evictor.free_table
assert allocator._refcounter.get(mutable.block_id) == 1
# Test case where two last accessed times are equal
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(20)))
def test_eviction_order(num_blocks: int, block_size: int, seed: int):
"""This test case simulate the two chain created and free in order,
and together they would exhaust the initial freed blocks.
So the next block created after those two chain shall use the block
from the first chain as that block has long access time.
While first chain has two blocks, it shall pick up the last one, as
it has larger token number.
"""
random.seed(seed)
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
num_blocks_to_consume = num_blocks + 1
token_ids = list(range(num_blocks_to_consume * block_size))
num_blocks_in_first_chain = 2
num_tokens_in_first_chain = block_size * num_blocks_in_first_chain
# First chain takes the first block
first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids[:num_tokens_in_first_chain],
allocator=allocator,
)
        # Only the first chain's blocks should be allocated at this point
assert allocator.get_num_free_blocks() == (num_blocks -
num_blocks_in_first_chain)
# Set the last accessed time of the first block to 1
blocks_ids = [block.block_id for block in first_chain]
allocator.mark_blocks_as_accessed(blocks_ids, 1)
# Second chain takes the rest of the blocks
second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids[num_tokens_in_first_chain:-block_size],
allocator=allocator,
)
# There shouldn't be any blocks left at this point
        assert allocator.get_num_free_blocks() == 0
assert len(first_chain) == num_blocks_in_first_chain
last_block_id = first_chain[-1].block_id
# Free each block in the first chain.
for i, block in enumerate(first_chain):
allocator.free(block)
# Set the last accessed time on all of the blocks in the second chain
# to 2
blocks_ids = [block.block_id for block in second_chain]
allocator.mark_blocks_as_accessed(blocks_ids, 2)
# Free each block in the second chain.
for i, block in enumerate(second_chain):
allocator.free(block)
# Allocate a new block and check that it's the least recently used block
# from the first chain.
new_block = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids[-block_size:],
allocator=allocator,
)
assert new_block[0].block_id == last_block_id
    # Test case for cache metrics
@staticmethod
def test_metric():
block_size = 16
allocator = PrefixCachingBlockAllocator(num_blocks=4,
block_size=block_size)
# Test when no query (0/0)
assert allocator.get_prefix_cache_hit_rate() == 0.0
token_ids = list(range(block_size))
allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids)
# Test 0/1 hit rate
assert allocator.get_prefix_cache_hit_rate() == 0.0
allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids)
# Test 1/2 hit rate
assert allocator.get_prefix_cache_hit_rate() == 0.5
# Test more than one block
for _ in range(2, 1005):
allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids)
assert allocator.get_prefix_cache_hit_rate() > 0.99
    # Test case for marking cache-hit blocks as computed right after
    # a batch of prefill sequences is scheduled.
@staticmethod
def test_touch_block():
block_size = 16
common_blocks = 4
allocator = PrefixCachingBlockAllocator(num_blocks=8,
block_size=block_size)
common_token_ids = list(range(block_size * common_blocks))
# Mimic the behavior of allocating the same block chain
# (i.e., common prefix) for a batch of 3 different prefill sequences.
for _ in range(3):
blocks = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=common_token_ids,
allocator=allocator,
)
block_ids = [block.block_id for block in blocks]
# The allocated blocks should be marked as touched
# but not computed.
computed_block_ids = allocator.get_computed_block_ids(
[], block_ids, skip_last_block_id=False)
assert len(computed_block_ids) == 0
allocator.mark_blocks_as_computed([])
computed_block_ids = allocator.get_computed_block_ids(
[], block_ids, skip_last_block_id=False)
assert len(computed_block_ids) == common_blocks
@staticmethod
def create_immutable_chain(
block_size: int,
token_ids: List[int],
allocator: PrefixCachingBlockAllocator,
) -> List[PrefixCachingBlock]:
"""Helper method which creates a chain of blocks.
"""
blocks: List[Block] = []
num_blocks = math.ceil(len(token_ids) / block_size)
if num_blocks == 0:
return []
prev_block = None
for block_number in range(0, num_blocks):
block_token_ids = token_ids[block_number *
block_size:(block_number + 1) *
block_size]
prev_block = allocator.allocate_immutable_block(
prev_block=prev_block, token_ids=block_token_ids)
blocks.append(prev_block)
return blocks
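
# Illustrative sketch (not part of the original change): the chained-hash
# scheme the content-hash tests above depend on. `block_hash` is
# illustrative; the real implementation is
# PrefixCachingBlock.hash_block_tokens.
def test_chained_hash_sketch():
    def block_hash(prev_hash, token_ids):
        # A full block's hash mixes the previous block's hash with its own
        # token ids, so equal-content chains hash equal.
        return hash((prev_hash, tuple(token_ids)))

    chain_a = block_hash(block_hash(None, [1, 2]), [3, 4])
    chain_b = block_hash(block_hash(None, [1, 2]), [3, 4])
    assert chain_a == chain_b                   # same prefix, same hash
    assert chain_a != block_hash(None, [3, 4])  # the prefix matters
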

View File

@@ -0,0 +1,509 @@
from typing import List
from unittest.mock import MagicMock
import pytest # noqa
from vllm.config import CacheConfig, SchedulerConfig
from vllm.core.scheduler import Scheduler
from vllm.sequence import Logprob, SequenceGroup
from .utils import create_dummy_prompt
def get_sequence_groups(scheduler_output):
return [s.seq_group for s in scheduler_output.scheduled_seq_groups]
def append_new_token(seq_group, token_id: int):
for seq in seq_group.get_seqs():
seq.append_token_id(token_id, {token_id: Logprob(token_id)})
def schedule_and_update_computed_tokens(scheduler):
metas, out, _ = scheduler.schedule()
for s, meta in zip(out.scheduled_seq_groups, metas):
s.seq_group.update_num_computed_tokens(meta.token_chunk_size)
return metas, out
def test_simple():
"""Verify basic scheduling works."""
block_size = 4
num_seq_group = 4
max_model_len = 16
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig("generate",
max_num_batched_tokens,
num_seq_group,
max_model_len,
enable_chunked_prefill=True)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
scheduler = Scheduler(scheduler_config, cache_config, None)
running: List[SequenceGroup] = []
# Add seq groups to scheduler.
for i in range(num_seq_group):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=block_size,
block_size=block_size)
scheduler.add_seq_group(seq_group)
running.append(seq_group)
# Schedule seq groups prompts.
num_tokens = block_size * num_seq_group
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert set(get_sequence_groups(out)) == set(running)
assert out.num_batched_tokens == num_tokens
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == num_seq_group
for s in running:
append_new_token(s, 1)
# Schedule seq groups generation.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert set(get_sequence_groups(out)) == set(running)
assert out.num_batched_tokens == num_seq_group
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == num_seq_group
def test_chunk():
"""Verify prefills are chunked properly."""
block_size = 4
max_seqs = 60
max_model_len = 80
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
enable_chunked_prefill=True,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 32
cache_config.num_gpu_blocks = 32
scheduler = Scheduler(scheduler_config, cache_config, None)
running: List[SequenceGroup] = []
# Add seq groups to scheduler.
for i in range(2):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group)
running.append(seq_group)
# Verify the second request is chunked.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert set(get_sequence_groups(out)) == set(running)
assert seq_group_meta[0].token_chunk_size == 60
# Verify it is chunked.
assert seq_group_meta[1].token_chunk_size == 4
assert out.num_prefill_groups == 2
assert out.num_batched_tokens == 64
# Only the first seq group has a new token appended.
append_new_token(running[0], 1)
# One chunked prefill, and one decoding.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert set(get_sequence_groups(out)) == set(running)
# The first one is prefill. Scheduler guarantees ordering.
assert seq_group_meta[0].token_chunk_size == 56
# The second one is a chunked prefill.
assert seq_group_meta[1].token_chunk_size == 1
assert out.num_prefill_groups == 1
assert out.num_batched_tokens == 57
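
# Illustrative sketch (not part of the original change): the budget split
# the first schedule in test_chunk expects. With a 64-token budget, the
# first 60-token prompt fits whole and the second receives only the
# remaining 4 tokens as its first chunk.
def test_chunk_budget_sketch():
    budget = 64
    chunks = []
    for prompt_len in [60, 60]:
        take = min(prompt_len, budget)
        chunks.append(take)
        budget -= take
    assert chunks == [60, 4]
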
def test_complex():
block_size = 4
max_seqs = 60
max_model_len = 80
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
enable_chunked_prefill=True,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 64
cache_config.num_gpu_blocks = 64
scheduler = Scheduler(scheduler_config, cache_config, None)
running: List[SequenceGroup] = []
# Add seq groups to scheduler.
for i in range(2):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group)
running.append(seq_group)
assert seq_group.is_prefill()
# Verify the second request is chunked.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert set(get_sequence_groups(out)) == set(running)
assert seq_group_meta[0].token_chunk_size == 60
# Verify it is chunked.
assert seq_group_meta[1].token_chunk_size == 4
assert not running[0].is_prefill()
assert running[1].is_prefill()
assert out.num_prefill_groups == 2
assert out.num_batched_tokens == 64
# Only the first seq group has a new token appended.
append_new_token(running[0], 1)
# Add 2 more requests.
for i in range(2, 4):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group)
running.append(seq_group)
# Decoding & chunked prefill & first chunk of 3rd request is scheduled.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert len(get_sequence_groups(out)) == 3
# The first one is the first chunked prefill.
assert seq_group_meta[0].token_chunk_size == 7
# The second one is the second new chunked prefill.
assert seq_group_meta[1].token_chunk_size == 56
# The last one is decode.
assert seq_group_meta[2].token_chunk_size == 1
# Two of them are in chunked prefill.
assert out.num_prefill_groups == 2
assert out.num_batched_tokens == 64
    # The first 2 requests are now in the decoding phase.
append_new_token(running[0], 1)
assert not running[0].is_prefill()
append_new_token(running[1], 1)
assert not running[1].is_prefill()
# The third request is still in prefill stage.
assert running[2].is_prefill()
def test_maximal_decoding():
"""Verify decoding requests are prioritized."""
block_size = 4
max_seqs = 2
max_model_len = 8
max_num_batched_tokens = 2
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
enable_chunked_prefill=True,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
scheduler = Scheduler(scheduler_config, cache_config, None)
running: List[SequenceGroup] = []
# Add seq groups to scheduler.
for i in range(2):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=2,
block_size=block_size)
scheduler.add_seq_group(seq_group)
running.append(seq_group)
assert seq_group.is_prefill()
# The first prefill is scheduled.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert len(get_sequence_groups(out)) == 1
assert seq_group_meta[0].token_chunk_size == 2
assert not running[0].is_prefill()
assert running[1].is_prefill()
assert out.num_prefill_groups == 1
assert out.num_batched_tokens == 2
# Only the first seq group has a new token appended.
append_new_token(running[0], 1)
# Create one more seq_group.
_, seq_group = create_dummy_prompt("3",
prompt_length=2,
block_size=block_size)
scheduler.add_seq_group(seq_group)
running.append(seq_group)
assert seq_group.is_prefill()
# The first decoding + second chunk is scheduled.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert len(get_sequence_groups(out)) == 2
assert seq_group_meta[0].token_chunk_size == 1
assert seq_group_meta[1].token_chunk_size == 1
assert not running[0].is_prefill()
assert running[1].is_prefill()
assert running[2].is_prefill()
assert out.num_prefill_groups == 1
assert out.num_batched_tokens == 2
append_new_token(running[0], 1)
# Decoding + running prefill is prioritized.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert len(get_sequence_groups(out)) == 2
assert seq_group_meta[0].token_chunk_size == 1
assert seq_group_meta[1].token_chunk_size == 1
assert not running[0].is_prefill()
assert not running[1].is_prefill()
assert out.num_prefill_groups == 1
assert out.num_batched_tokens == 2
append_new_token(running[0], 1)
append_new_token(running[1], 1)
# Only decoding is prioritized.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert len(get_sequence_groups(out)) == 2
assert seq_group_meta[0].token_chunk_size == 1
assert seq_group_meta[1].token_chunk_size == 1
assert not running[0].is_prefill()
assert not running[1].is_prefill()
assert out.num_prefill_groups == 0
assert out.num_batched_tokens == 2
append_new_token(running[0], 1)
append_new_token(running[1], 1)
    # After aborting the decoding request, the new prefill is prioritized
    # (FCFS).
scheduler.abort_seq_group(running[0].request_id)
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert len(get_sequence_groups(out)) == 2
assert seq_group_meta[0].token_chunk_size == 1
assert seq_group_meta[1].token_chunk_size == 1
assert not running[1].is_prefill()
assert running[2].is_prefill()
assert out.num_prefill_groups == 1
assert out.num_batched_tokens == 2
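
# Illustrative sketch (not part of the original change): the ordering the
# test above exercises, assuming the scheduler budgets running decodes
# before any prefill work.
def test_decode_priority_sketch():
    requests = [("prefill", "B"), ("decode", "A"), ("prefill", "C")]
    # Stable sort: decodes first, prefills keep their arrival order.
    ordered = sorted(requests, key=lambda r: r[0] != "decode")
    assert [name for _, name in ordered] == ["A", "B", "C"]
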
def test_prompt_limit():
"""Verify max_num_batched_tokens < max_model_len is possible."""
block_size = 4
max_seqs = 32
max_model_len = 64
max_num_batched_tokens = 32
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
enable_chunked_prefill=True,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 16
scheduler = Scheduler(scheduler_config, cache_config, None)
running: List[SequenceGroup] = []
_, seq_group = create_dummy_prompt("1",
prompt_length=48,
block_size=block_size)
scheduler.add_seq_group(seq_group)
running.append(seq_group)
assert seq_group.is_prefill()
    # A prompt longer than max_num_batched_tokens should still be scheduled
    # (as a chunk).
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert len(get_sequence_groups(out)) == 1
assert seq_group_meta[0].token_chunk_size == 32
assert running[0].is_prefill()
assert out.num_prefill_groups == 1
assert out.num_batched_tokens == 32
def test_prompt_limit_exceed():
block_size = 4
max_seqs = 64
max_model_len = 32
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig("generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
enable_chunked_prefill=True)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 16
scheduler = Scheduler(scheduler_config, cache_config, None)
running: List[SequenceGroup] = []
_, seq_group = create_dummy_prompt("2",
prompt_length=48,
block_size=block_size)
scheduler.add_seq_group(seq_group)
running.append(seq_group)
assert seq_group.is_prefill()
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert len(out.ignored_seq_groups) == 1
assert out.ignored_seq_groups[0] == seq_group
def test_chunked_prefill_preempt():
"""Verify preempt works with chunked prefill requests"""
block_size = 4
max_seqs = 30
max_model_len = 200
max_num_batched_tokens = 30
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
enable_chunked_prefill=True,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 16
scheduler = Scheduler(scheduler_config, cache_config, None)
_, seq_group = create_dummy_prompt("1",
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group)
_, out = schedule_and_update_computed_tokens(scheduler)
    # The request is chunked; only the first prefill chunk is scheduled now.
assert len(out.scheduled_seq_groups) == 1
assert out.num_prefill_groups == 1
assert seq_group.is_prefill()
assert out.num_batched_tokens == max_num_batched_tokens
# The request should be preempted.
scheduler.block_manager.can_append_slots = MagicMock()
def cannot_append_second_group1(seq_group, num_lookahead_slots):
return seq_group.request_id != "1"
scheduler.block_manager.can_append_slots.side_effect = (
cannot_append_second_group1)
# The running prefill is now preempted.
_, out = schedule_and_update_computed_tokens(scheduler)
assert len(out.scheduled_seq_groups) == 0
assert out.num_batched_tokens == 0
assert out.blocks_to_swap_out == []
assert out.blocks_to_swap_in == []
# Make sure we can reschedule preempted request.
_, out = schedule_and_update_computed_tokens(scheduler)
assert len(out.scheduled_seq_groups) == 1
assert out.num_prefill_groups == 1
assert seq_group.is_prefill()
assert out.num_batched_tokens == max_num_batched_tokens
assert seq_group.get_num_uncomputed_tokens() == 30
# We should be able to run prefill twice as it is chunked.
def cannot_append_second_group2(seq_group, num_lookahead_slots):
return True
scheduler.block_manager.can_append_slots.side_effect = (
cannot_append_second_group2)
_, out = schedule_and_update_computed_tokens(scheduler)
assert len(out.scheduled_seq_groups) == 1
assert out.num_prefill_groups == 1
assert not seq_group.is_prefill()
assert out.num_batched_tokens == max_num_batched_tokens
def test_chunked_prefill_max_seqs():
block_size = 4
max_seqs = 2
max_model_len = 80
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
enable_chunked_prefill=True,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 128
cache_config.num_gpu_blocks = 128
scheduler = Scheduler(scheduler_config, cache_config, None)
running: List[SequenceGroup] = []
_, seq_group = create_dummy_prompt("1",
prompt_length=65,
block_size=block_size)
scheduler.add_seq_group(seq_group)
running.append(seq_group)
# The first prefill is chunked.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert seq_group_meta[0].token_chunk_size == max_num_batched_tokens
assert len(get_sequence_groups(out)) == 1
# Add new requests.
for i in range(4):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=65,
block_size=block_size)
scheduler.add_seq_group(seq_group)
running.append(seq_group)
# Make sure only 2 requests are scheduled.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert out.num_batched_tokens == max_num_batched_tokens
assert len(get_sequence_groups(out)) == 2
assert not running[0].is_prefill()
assert running[1].is_prefill()
append_new_token(running[0], 1)
# Although we have enough token budget, we can only schedule max_seqs.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert seq_group_meta[0].token_chunk_size == 2
assert seq_group_meta[1].token_chunk_size == 1
assert out.num_batched_tokens == 3
assert len(get_sequence_groups(out)) == max_seqs
assert not running[0].is_prefill()
assert not running[1].is_prefill()
def test_prefix_caching():
"""Verify allocating full blocks when prefix caching is enabled."""
block_size = 4
max_seqs = 10
max_model_len = 80
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
enable_chunked_prefill=True,
)
cache_config = CacheConfig(block_size,
1.0,
1,
"auto",
enable_prefix_caching=True)
cache_config.num_cpu_blocks = 0
cache_config.num_gpu_blocks = 32
scheduler = Scheduler(scheduler_config, cache_config, None)
running: List[SequenceGroup] = []
# Add seq groups to scheduler.
for i in range(2):
_, seq_group = create_dummy_prompt(str(i),
block_size=block_size,
prompt_length=50)
scheduler.add_seq_group(seq_group)
running.append(seq_group)
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert set(get_sequence_groups(out)) == set(running)
assert seq_group_meta[0].token_chunk_size == 50
# Verify it is chunked. Note that although the budget is 64-50=14,
# we only allocate full blocks for prefix caching, so only 4*(14//4)=12
# tokens are allocated.
assert seq_group_meta[1].token_chunk_size == 12
assert out.num_prefill_groups == 2
assert out.num_batched_tokens == 62
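# Illustrative sketch (not part of the original tests): with prefix caching
# enabled, a prefill chunk is rounded down to a full-block boundary so that
# only whole blocks are cached. A hypothetical stand-alone helper:
def _full_block_chunk(remaining_budget: int, block_size: int) -> int:
    """Round a chunk size down to a multiple of block_size."""
    return block_size * (remaining_budget // block_size)

# With a 64-token budget, 50 tokens already scheduled, and block_size 4:
assert _full_block_chunk(64 - 50, 4) == 12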

View File

@@ -0,0 +1,80 @@
import pytest
from tests.conftest import VllmRunner
from tests.core.utils import create_dummy_prompt
from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform
from vllm.sequence import SequenceGroup
MODEL = "JackFram/llama-160m"
def add_seq_group_to_engine(engine: LLMEngine, seq_group: SequenceGroup):
scheduler = engine.scheduler[0]
scheduler.add_seq_group(seq_group)
@pytest.mark.parametrize("num_scheduler_steps", [1, 8])
@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
@pytest.mark.parametrize("enforce_eager", [False, True])
def test_num_computed_tokens_update(num_scheduler_steps: int,
enable_chunked_prefill: bool,
enforce_eager: bool):
is_multi_step = num_scheduler_steps > 1
is_multi_step_chunked_prefill = is_multi_step and enable_chunked_prefill
if is_multi_step_chunked_prefill and current_platform.is_rocm():
pytest.skip("Multi-step with Chunked-Prefill does not support "
"rocm_flash_attn backend")
# Make a vllm engine
runner = VllmRunner(model_name=MODEL,
gpu_memory_utilization=0.3,
num_scheduler_steps=num_scheduler_steps,
enable_chunked_prefill=enable_chunked_prefill,
enforce_eager=enforce_eager)
engine: LLMEngine = runner.model.llm_engine
    # In multi-step + chunked-prefill there is no separate single prompt
    # step; whatever is scheduled always runs for num_scheduler_steps.
num_prompt_steps = num_scheduler_steps \
if is_multi_step_chunked_prefill else 1
num_output_tokens_list = [4, 8, 12, 15, 16, 17]
# Create sequence and add to engine
prompt_len = 10
for req_idx, num_output_tokens in enumerate(num_output_tokens_list):
seq, seq_group = create_dummy_prompt(request_id=str(req_idx),
prompt_length=prompt_len,
min_tokens=num_output_tokens,
max_tokens=num_output_tokens)
add_seq_group_to_engine(engine, seq_group)
assert seq.data.get_num_computed_tokens() == 0
for _ in range(num_prompt_steps):
# prompt steps
engine.step()
if not seq.is_finished():
prompt_num_computed_tokens = seq.data.get_num_computed_tokens()
# Test correctness of num_computed_tokens after the prompt steps
assert prompt_num_computed_tokens == \
prompt_len + num_prompt_steps - 1
decode_step_counter = 0
while not seq.is_finished():
# Test correctness of num_computed_tokens after the decode steps
assert seq.data.get_num_computed_tokens(
) == prompt_num_computed_tokens + decode_step_counter
for _ in range(num_scheduler_steps):
# decode step
engine.step()
decode_step_counter += 1
# Test correctness of num_computed_tokens after the sequence finish.
assert seq.data.get_num_computed_tokens(
) == prompt_len + num_output_tokens - 1
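# Illustrative sketch (not part of the original test): the invariant checked
# above. num_computed_tokens trails the total generated length by exactly one
# token, because the most recently sampled token has not been computed yet.
def _expected_final_computed_tokens(prompt_len: int,
                                    num_output_tokens: int) -> int:
    return prompt_len + num_output_tokens - 1

assert _expected_final_computed_tokens(10, 4) == 13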

View File

@@ -0,0 +1,802 @@
import time
from collections import deque
from typing import List, Set, Tuple
from unittest.mock import MagicMock
import pytest # noqa
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
from vllm.core.interfaces import AllocStatus
from vllm.core.scheduler import Scheduler, SchedulingBudget
from vllm.lora.request import LoRARequest
from vllm.sequence import SequenceGroup
from .utils import (append_new_token, append_new_token_seq_group,
create_dummy_prompt, get_sequence_groups,
schedule_and_update_computed_tokens)
def test_scheduler_add_seq_group():
block_size = 4
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens=100,
max_num_seqs=64,
max_model_len=1,
)
cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4
scheduler = Scheduler(scheduler_config, cache_config, None)
# Add seq group to scheduler.
num_seq_group = 4
for i in range(num_seq_group):
_, seq_group = create_dummy_prompt(str(i),
block_size,
block_size=block_size)
scheduler.add_seq_group(seq_group)
assert scheduler.get_num_unfinished_seq_groups() == i + 1
def test_scheduler_abort_seq_group():
block_size = 4
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens=100,
max_num_seqs=64,
max_model_len=1,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4
scheduler = Scheduler(scheduler_config, cache_config, None)
# Add multiple seq groups to scheduler.
num_seq_group = 4
request_ids: Set[str] = set()
for i in range(num_seq_group):
_, seq_group = create_dummy_prompt(str(i), block_size)
scheduler.add_seq_group(seq_group)
request_ids.add(str(i))
# Abort all added seq groups.
assert scheduler.get_num_unfinished_seq_groups() == num_seq_group
scheduler.abort_seq_group(request_ids)
assert scheduler.get_num_unfinished_seq_groups() == 0
def test_scheduler_schedule_simple():
block_size = 4
num_seq_group = 4
max_model_len = 16
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens=64,
max_num_seqs=num_seq_group,
max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
scheduler = Scheduler(scheduler_config, cache_config, None)
running: List[SequenceGroup] = []
# Add seq groups to scheduler.
for i in range(num_seq_group):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=block_size,
block_size=block_size)
scheduler.add_seq_group(seq_group)
running.append(seq_group)
# Schedule seq groups prompts.
num_tokens = block_size * num_seq_group
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert set(get_sequence_groups(out)) == set(running)
assert out.num_batched_tokens == num_tokens
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == num_seq_group
append_new_token(out, 1)
# Schedule seq groups generation.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert set(get_sequence_groups(out)) == set(running)
assert out.num_batched_tokens == num_seq_group
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == num_seq_group
append_new_token(out, 1)
def test_scheduler_prefill_prioritized():
"""Verify running batched tokens are not applied to prefill requests."""
block_size = 4
max_model_len = 30
max_batched_num_tokens = 30
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens=max_batched_num_tokens,
max_num_seqs=2,
max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 16
scheduler = Scheduler(scheduler_config, cache_config, None)
# Add seq groups to scheduler.
_, seq_group_a = create_dummy_prompt("1", 1, block_size=block_size)
scheduler.add_seq_group(seq_group_a)
# Schedule seq groups prompts.
_, out = schedule_and_update_computed_tokens(scheduler)
assert get_sequence_groups(out) == [seq_group_a]
# Add a new prefill request B.
_, seq_group_b = create_dummy_prompt("2", 30, block_size=block_size)
scheduler.add_seq_group(seq_group_b)
    # Verify prefill requests are prioritized. Since request B's 30-token
    # prompt consumes the entire max_batched_num_tokens budget, the new
    # prefill request has to be scheduled first.
_, out = schedule_and_update_computed_tokens(scheduler)
assert get_sequence_groups(out) == [seq_group_b]
def test_scheduler_schedule_preempt_abort():
block_size = 4
max_model_len = 16
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens=64,
max_num_seqs=2,
max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 2
cache_config.num_gpu_blocks = 2
scheduler = Scheduler(scheduler_config, cache_config, None)
# Add seq groups to scheduler.
seq_a, seq_group_a = create_dummy_prompt("1",
block_size,
block_size=block_size)
seq_b, seq_group_b = create_dummy_prompt("2",
block_size,
block_size=block_size)
scheduler.add_seq_group(seq_group_a)
scheduler.add_seq_group(seq_group_b)
# Schedule seq groups prompts.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert get_sequence_groups(out) == [seq_group_a, seq_group_b]
assert out.num_batched_tokens == block_size * 2 # seq_a and seq_b
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == 2
assert scheduler.get_num_unfinished_seq_groups() == 2
# Append "generated" tokens, allowing the sequence to mark prompt tokens as
# processed.
append_new_token(out, 1)
# Schedule seq groups generation and preempt seq group b.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert get_sequence_groups(out) == [seq_group_a]
assert out.num_batched_tokens == 1
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == 1
assert scheduler.get_num_unfinished_seq_groups() == 2
assert out.preempted == 1
# Abort seq group a. Re-schedule seq group b prompt with recomputation.
scheduler.abort_seq_group("1")
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert get_sequence_groups(out) == [seq_group_b]
assert out.num_batched_tokens == 5 # 4 prompt + 1 generation.
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == 1
assert scheduler.get_num_unfinished_seq_groups() == 1
def test_scheduler_max_seqs():
block_size = 4
num_seq_group = 4
max_seq_group = 2
max_model_len = 16
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens=64,
max_num_seqs=max_seq_group,
max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
scheduler = Scheduler(scheduler_config, cache_config, None)
all_seq_groups: List[SequenceGroup] = []
# Add seq groups to scheduler.
for i in range(num_seq_group):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=block_size,
block_size=block_size)
all_seq_groups.append(seq_group)
# Append 1 seq group
scheduler.add_seq_group(all_seq_groups[0])
# Schedule seq groups prompts.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert set(get_sequence_groups(out)) == set([all_seq_groups[0]])
append_new_token(out, 1)
# Schedule seq groups generation.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert set(get_sequence_groups(out)) == set([all_seq_groups[0]])
append_new_token(out, 1)
# Append 2 more seq group
scheduler.add_seq_group(all_seq_groups[1])
scheduler.add_seq_group(all_seq_groups[2])
# Schedule seq groups prompts.
# Only 1 seq group should be scheduled since max_seq_group is 2
# and one is prompting.
_, out = schedule_and_update_computed_tokens(scheduler)
assert set(get_sequence_groups(out)) == set([all_seq_groups[1]])
def test_scheduler_delay_factor():
block_size = 4
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens=100,
max_num_seqs=64,
max_model_len=16,
delay_factor=0.5,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
scheduler = Scheduler(scheduler_config, cache_config, None)
# schedule first prompt
seq_group_meta, seq_group = create_dummy_prompt("0",
prompt_length=block_size,
block_size=block_size)
scheduler.add_seq_group(seq_group)
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert out.num_prefill_groups > 0
assert seq_group_meta[0].request_id == '0'
append_new_token(out, 1)
# wait for a second before scheduling next prompt
time.sleep(1)
seq_group_meta, seq_group = create_dummy_prompt("1",
prompt_length=block_size,
block_size=block_size)
scheduler.add_seq_group(seq_group)
# second prompt should *not* be scheduled
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert out.num_prefill_groups == 0
assert seq_group_meta[0].request_id == '0'
append_new_token(out, 1)
# wait for more than 0.5 second and try again
time.sleep(0.6)
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert out.num_prefill_groups > 0
assert seq_group_meta[0].request_id == '1'
append_new_token(out, 1)
def initialize_scheduler(
*,
max_num_seqs=1000,
max_token_budget=1000,
max_model_len=1000,
lora_config=None,
block_size=4,
num_cpu_blocks=8,
num_gpu_blocks=8,
):
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens=max_token_budget,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = num_cpu_blocks
cache_config.num_gpu_blocks = num_gpu_blocks
scheduler = Scheduler(scheduler_config, cache_config, lora_config)
return scheduler
def create_token_budget(token_budget: int = 10000,
max_num_seqs: int = 10000) -> SchedulingBudget:
return SchedulingBudget(
token_budget=token_budget,
max_num_seqs=max_num_seqs,
)
def add_token_budget(budget: SchedulingBudget,
num_batched_tokens: int = 0,
num_curr_seqs: int = 0):
mock_seq_group = create_dummy_prompt('10', prompt_length=60)[1]
budget.add_num_batched_tokens(mock_seq_group.request_id,
num_batched_tokens)
budget.add_num_seqs(mock_seq_group.request_id, num_curr_seqs)
def test_prefill_schedule_max_prompt_len():
"""
    Test that a prompt longer than max_model_len is ignored.
"""
block_size = 4
scheduler = initialize_scheduler(max_model_len=30, block_size=block_size)
_, seq_group = create_dummy_prompt("0",
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group)
budget = create_token_budget()
output = scheduler._schedule_prefills(budget, None)
remaining_waiting = scheduler.waiting
assert len(output.ignored_seq_groups) == 1
assert len(output.seq_groups) == 0
assert budget.num_batched_tokens == 0
assert budget.num_curr_seqs == 0
assert len(remaining_waiting) == 0
def test_prefill_schedule_token_budget():
"""
    Test that the token budget is respected.
"""
block_size = 4
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
budget = create_token_budget(token_budget=0)
for i in range(2):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group)
# 0 token budget == nothing is scheduled.
output = scheduler._schedule_prefills(budget, None)
remaining_waiting = scheduler.waiting
assert len(output.ignored_seq_groups) == 0
assert len(output.seq_groups) == 0
assert budget.num_batched_tokens == 0
assert budget.num_curr_seqs == 0
assert len(remaining_waiting) == 2
# 60 token budget == 1 request scheduled.
budget = create_token_budget(token_budget=60)
output = scheduler._schedule_prefills(budget, None)
remaining_waiting = scheduler.waiting
assert len(output.ignored_seq_groups) == 0
assert len(output.seq_groups) == 1
assert budget.num_batched_tokens == 60
assert budget.num_curr_seqs == 1
assert len(remaining_waiting) == 1
    # Test that tokens already counted in the budget are respected.
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=16,
num_gpu_blocks=16)
budget = create_token_budget(token_budget=60)
add_token_budget(budget, 30, 0)
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
# Cannot schedule a prompt that doesn't fit the budget.
scheduler.add_seq_group(seq_group)
output = scheduler._schedule_prefills(budget, None)
remaining_waiting = scheduler.waiting
assert len(output.ignored_seq_groups) == 0
assert len(output.seq_groups) == 0
assert budget.num_batched_tokens == 30
assert budget.num_curr_seqs == 0
assert len(remaining_waiting) == 1
budget = create_token_budget(token_budget=90)
add_token_budget(budget, 30, 0)
output = scheduler._schedule_prefills(budget, None)
remaining_waiting = scheduler.waiting
assert len(output.seq_groups) == 1
assert budget.num_batched_tokens == 90
assert budget.num_curr_seqs == 1
assert len(remaining_waiting) == 0
def test_prefill_schedule_max_seqs():
"""
    Test that max_num_seqs is respected.
"""
block_size = 4
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
budget = create_token_budget(max_num_seqs=2)
for i in range(3):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group)
output = scheduler._schedule_prefills(budget, None)
remaining_waiting = scheduler.waiting
assert len(output.ignored_seq_groups) == 0
assert len(output.seq_groups) == 2
assert budget.num_batched_tokens == 120
assert budget.num_curr_seqs == 2
assert len(remaining_waiting) == 1
    # Verify that the current number of sequences is respected.
scheduler.waiting = deque()
budget = create_token_budget(max_num_seqs=2)
add_token_budget(budget, 0, 2)
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group)
output = scheduler._schedule_prefills(budget, None)
remaining_waiting = scheduler.waiting
assert len(output.ignored_seq_groups) == 0
assert len(output.seq_groups) == 0
assert budget.num_batched_tokens == 0
assert budget.num_curr_seqs == 2
assert len(remaining_waiting) == 1
def test_prefill_schedule_max_lora():
"""
    Test that max_loras is respected and LoRA requests are prioritized.
"""
block_size = 4
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
scheduler = initialize_scheduler(lora_config=lora_config,
block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
budget = create_token_budget(token_budget=120)
curr_loras: Set[int] = set()
for i in range(2):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size,
lora_request=LoRARequest(
lora_name=str(i),
lora_int_id=i + 1,
lora_path="abc"))
scheduler.add_seq_group(seq_group)
# Add two more requests to verify lora is prioritized.
# 0: Lora, 1: Lora, 2: regular, 3: regular
# In the first iteration, index 0, 2 is scheduled.
# If a request is not scheduled because it hits max lora, it is
# prioritized. Verify that.
for i in range(2, 4):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group)
# Schedule 2 requests (0 and 2)
output = scheduler._schedule_prefills(budget, curr_loras)
remaining_waiting = scheduler.waiting
assert len(output.ignored_seq_groups) == 0
assert len(output.seq_groups) == 2
assert budget.num_batched_tokens == 120
assert budget.num_curr_seqs == 2
assert len(remaining_waiting) == 2
assert len(curr_loras) == 1
# The second lora request is scheduled next as FCFS policy.
# Reset curr_loras so that it can be scheduled.
curr_loras = set()
budget = create_token_budget(token_budget=60)
output = scheduler._schedule_prefills(budget, curr_loras)
remaining_waiting = scheduler.waiting
assert len(output.seq_groups) == 1
assert output.seq_groups[0].seq_group.request_id == "1"
assert len(remaining_waiting) == 1
assert len(curr_loras) == 1
assert budget.num_batched_tokens == 60
def test_prefill_schedule_no_block_manager_capacity():
"""
    Test that sequences cannot be scheduled when the block manager
    has no capacity.
"""
block_size = 4
scheduler = initialize_scheduler(block_size=block_size,
num_gpu_blocks=128,
num_cpu_blocks=128)
budget = create_token_budget()
for i in range(3):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group)
scheduler.block_manager.can_allocate = MagicMock()
scheduler.block_manager.can_allocate.return_value = AllocStatus.LATER
output = scheduler._schedule_prefills(budget, None)
remaining_waiting = scheduler.waiting
assert len(output.ignored_seq_groups) == 0
assert len(output.seq_groups) == 0
assert budget.num_batched_tokens == 0
assert budget.num_curr_seqs == 0
assert len(remaining_waiting) == 3
scheduler = initialize_scheduler()
budget = create_token_budget()
for i in range(3):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group)
scheduler.block_manager.can_allocate = MagicMock()
scheduler.block_manager.can_allocate.return_value = AllocStatus.NEVER
output = scheduler._schedule_prefills(budget, None)
remaining_waiting = scheduler.waiting
assert len(output.ignored_seq_groups) == 3
assert len(output.seq_groups) == 0
assert budget.num_batched_tokens == 0
assert budget.num_curr_seqs == 0
assert len(remaining_waiting) == 0
def test_decode_schedule_preempted():
"""
    Test that decodes which cannot be scheduled are preempted.
"""
block_size = 4
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
curr_loras = None
for i in range(3):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
scheduler._add_seq_group_to_running(seq_group)
scheduler.block_manager.can_append_slots = MagicMock()
def cannot_append_second_group(seq_group, num_lookahead_slots):
return seq_group.request_id != "1"
scheduler.block_manager.can_append_slots.side_effect = (
cannot_append_second_group)
    # Request "1" cannot append slots, so the lowest-priority request
    # ("2") is preempted first; request "1" is then preempted as well.
budget = create_token_budget()
output = scheduler._schedule_running(budget, curr_loras)
    remaining_running = scheduler.running
    assert len(remaining_running) == 0
assert len(output.decode_seq_groups) == 1
assert len(output.prefill_seq_groups) == 0
assert output.decode_seq_groups[0].seq_group.request_id == "0"
assert len(output.preempted) == 2
# Verify budgets are updated.
assert budget.num_batched_tokens == 1
    # NOTE: When enable_chunked_prefill is False, the num_seqs budget is
    # not updated.
# assert budget.num_curr_seqs == 1
# Both should be preempted, not swapped.
assert output.blocks_to_swap_out == []
# Nothing is copied.
assert output.blocks_to_copy == []
def test_schedule_decode_blocks_to_copy_update():
"""
Verify blocks_to_copy is updated.
"""
block_size = 4
scheduler = initialize_scheduler(block_size=4,
num_cpu_blocks=16,
num_gpu_blocks=16)
_, seq_group = create_dummy_prompt("1",
prompt_length=60,
best_of=2,
block_size=block_size)
curr_loras = None
scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
scheduler._add_seq_group_to_running(seq_group)
    # Mock append_slots to report a block to copy (copy-on-write).
scheduler.block_manager.append_slots = MagicMock()
scheduler.block_manager.append_slots.return_value = [(2, 3)]
budget = create_token_budget()
output = scheduler._schedule_running(budget, curr_loras)
remaining_running = scheduler.running
assert len(remaining_running) == 0
assert len(output.decode_seq_groups) == 1
assert len(output.prefill_seq_groups) == 0
assert len(output.preempted) == 0
assert len(output.swapped_out) == 0
# Nothing is preempted.
assert output.blocks_to_swap_out == []
    # Since append_slots returns the source -> dest mapping, it should be
    # applied.
assert output.blocks_to_copy == [(2, 3)]
def test_schedule_swapped_max_loras():
block_size = 4
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
scheduler = initialize_scheduler(lora_config=lora_config,
block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
curr_loras: Set[int] = set()
blocks_to_swap_out: List[Tuple[int, int]] = []
for i in range(2):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size,
lora_request=LoRARequest(
lora_name=str(i),
lora_int_id=i + 1,
lora_path="abc"))
scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
scheduler._swap_out(seq_group, blocks_to_swap_out)
scheduler._add_seq_group_to_swapped(seq_group)
budget = create_token_budget()
output = scheduler._schedule_swapped(budget, curr_loras)
remaining_swapped = scheduler.swapped
assert len(remaining_swapped) == 1
assert budget.num_batched_tokens == 1
assert budget.num_curr_seqs == 1
assert len(output.decode_seq_groups) == 1
assert len(output.prefill_seq_groups) == 0
assert len(curr_loras) == 1
def test_schedule_swapped_cannot_swap_in():
block_size = 4
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
curr_loras = None
blocks_to_swap_out: List[Tuple[int, int]] = []
for i in range(2):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
best_of=2,
block_size=block_size)
scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
scheduler._swap_out(seq_group, blocks_to_swap_out)
scheduler._add_seq_group_to_swapped(seq_group)
    # Mock can_swap_in so that no request can be swapped in yet.
scheduler.block_manager.can_swap_in = MagicMock()
scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER
# Since we cannot swap in, none of the requests are swapped in.
budget = create_token_budget()
output = scheduler._schedule_swapped(budget, curr_loras)
remaining_swapped = scheduler.swapped
assert len(remaining_swapped) == 2
assert budget.num_batched_tokens == 0
assert budget.num_curr_seqs == 0
assert len(output.decode_seq_groups) == 0
assert len(output.prefill_seq_groups) == 0
def test_infeasible_swap():
block_size = 4
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
curr_loras = None
blocks_to_swap_out: List[Tuple[int, int]] = []
for i in range(2):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
best_of=2,
block_size=block_size)
scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
scheduler._swap_out(seq_group, blocks_to_swap_out)
scheduler._add_seq_group_to_swapped(seq_group)
    # Mock can_swap_in so that swapping in is never possible.
scheduler.block_manager.can_swap_in = MagicMock()
scheduler.block_manager.can_swap_in.return_value = AllocStatus.NEVER
# Since we cannot swap in, none of the requests are swapped in.
budget = create_token_budget()
output = scheduler._schedule_swapped(budget, curr_loras)
remaining_swapped = scheduler.swapped
assert len(remaining_swapped) == 0
assert len(output.infeasible_seq_groups) == 2
assert budget.num_batched_tokens == 0
assert budget.num_curr_seqs == 0
assert len(output.decode_seq_groups) == 0
assert len(output.prefill_seq_groups) == 0
def test_schedule_swapped_blocks_to_copy():
block_size = 4
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
curr_loras = None
_, seq_group = create_dummy_prompt("1",
prompt_length=60,
best_of=2,
block_size=block_size)
scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
blocks_to_swap_out: List[Tuple[int, int]] = []
scheduler._swap_out(seq_group, blocks_to_swap_out)
scheduler._add_seq_group_to_swapped(seq_group)
    # Mock append_slots to report a block to copy (copy-on-write).
scheduler.block_manager.append_slots = MagicMock()
scheduler.block_manager.append_slots.return_value = [(2, 3)]
budget = create_token_budget()
output = scheduler._schedule_swapped(budget, curr_loras)
remaining_swapped = scheduler.swapped
assert len(remaining_swapped) == 0
assert len(output.decode_seq_groups) == 1
assert len(output.prefill_seq_groups) == 0
assert output.blocks_to_copy == [(2, 3)]
def test_scheduling_budget():
TOKEN_BUDGET = 4
MAX_SEQS = 4
budget = SchedulingBudget(token_budget=TOKEN_BUDGET, max_num_seqs=MAX_SEQS)
assert budget.can_schedule(num_new_tokens=1, num_new_seqs=1)
assert budget.can_schedule(num_new_tokens=4, num_new_seqs=4)
assert not budget.can_schedule(num_new_tokens=1, num_new_seqs=5)
assert not budget.can_schedule(num_new_tokens=5, num_new_seqs=1)
assert not budget.can_schedule(num_new_tokens=5, num_new_seqs=5)
assert budget.remaining_token_budget() == TOKEN_BUDGET
# Verify add/subtract num batched tokens.
_, seq_group = create_dummy_prompt("1", 3)
budget.add_num_batched_tokens(seq_group.request_id, 2)
assert budget.remaining_token_budget() == 2
assert budget.num_batched_tokens == 2
assert budget.can_schedule(num_new_tokens=2, num_new_seqs=1)
assert not budget.can_schedule(num_new_tokens=3, num_new_seqs=1)
# Verify adding another seq group is no-op.
budget.add_num_batched_tokens(seq_group.request_id, 2)
assert budget.remaining_token_budget() == 2
assert budget.num_batched_tokens == 2
budget.subtract_num_batched_tokens(seq_group.request_id, 2)
assert budget.remaining_token_budget() == 4
assert budget.num_batched_tokens == 0
budget.subtract_num_batched_tokens(seq_group.request_id, 2)
assert budget.remaining_token_budget() == 4
assert budget.num_batched_tokens == 0
# Verify add/subtract max seqs.
_, seq_group = create_dummy_prompt("1", 3)
budget.add_num_seqs(seq_group.request_id, 2)
assert budget.can_schedule(num_new_tokens=1, num_new_seqs=2)
assert not budget.can_schedule(num_new_tokens=1, num_new_seqs=3)
assert budget.num_curr_seqs == 2
# Verify adding another seq group is no-op.
budget.add_num_seqs(seq_group.request_id, 2)
assert budget.num_curr_seqs == 2
budget.subtract_num_seqs(seq_group.request_id, 2)
assert budget.num_curr_seqs == 0
budget.subtract_num_seqs(seq_group.request_id, 2)
assert budget.num_curr_seqs == 0
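# Illustrative sketch (not part of the original tests): the budget tracks
# request ids, so repeated add calls for the same request are no-ops, as the
# assertions above verify. A minimal model of that bookkeeping:
class _TinyBudget:

    def __init__(self) -> None:
        self._request_ids: Set[str] = set()
        self.num_batched_tokens = 0

    def add_num_batched_tokens(self, request_id: str,
                               num_tokens: int) -> None:
        if request_id in self._request_ids:
            return  # Already accounted for; adding again is a no-op.
        self._request_ids.add(request_id)
        self.num_batched_tokens += num_tokens

_budget = _TinyBudget()
_budget.add_num_batched_tokens("1", 2)
_budget.add_num_batched_tokens("1", 2)
assert _budget.num_batched_tokens == 2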

View File

@@ -0,0 +1,104 @@
from typing import List
import pytest # noqa
from vllm.config import CacheConfig, SchedulerConfig
from vllm.core.scheduler import Scheduler
from vllm.sequence import SequenceGroup
from .utils import (append_new_token, create_dummy_prompt_encoder_decoder,
get_sequence_groups, schedule_and_update_computed_tokens)
def test_scheduler_schedule_simple_encoder_decoder():
'''
Test basic scheduler functionality in the context
    of an encoder/decoder model. Focus on testing
    enc/dec-specific functionality, since tests already
    exist for decoder-only functionality.
Test behavior:
* Construct Scheduler
* Construct dummy encoder/decoder sequence groups
* Add dummy seq groups to scheduler backlog
* Schedule the next seq group & validate:
* Cross-attn block tables
* Updated states of seq groups
* Number of batched tokens
* Number of blocks to copy/swap-in/swap-out
* Number of scheduled seq groups
* Repeat for both prefill- and decode-phase
* Abort scheduled seq groups
* Assert that aborted seq groups no longer appear in
cross-attention block table
'''
block_size = 4
num_seq_group = 4
max_model_len = 16
scheduler_config = SchedulerConfig(
task="generate",
max_num_batched_tokens=64,
max_num_seqs=num_seq_group,
max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group
cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group
scheduler = Scheduler(scheduler_config, cache_config, None)
running: List[SequenceGroup] = []
# Add seq groups to scheduler.
req_id_list = []
for i in range(num_seq_group):
req_id = str(i)
req_id_list.append(req_id)
_, _, seq_group = create_dummy_prompt_encoder_decoder(
req_id, block_size, block_size, block_size)
scheduler.add_seq_group(seq_group)
running.append(seq_group)
# Schedule seq groups prefill.
num_tokens = block_size * num_seq_group
seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)
# - Verify that sequence group cross-attention block tables are
# registered with the block manager
assert all([(req_id in scheduler.block_manager.cross_block_tables)
for req_id in req_id_list])
# - Validate sequence-group status
assert set(get_sequence_groups(out)) == set(running)
# - Validate number of batched tokens
assert out.num_batched_tokens == num_tokens
# - Validate there are no remaining blocks to swap
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
# - Validate all seq groups were scheduled
assert len(seq_group_meta_list) == num_seq_group
append_new_token(out, 1)
# Schedule seq groups decode.
seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)
# - Verify that sequence group metadata includes encoder attention
# and cross-attention metadata
assert all([
not ((seq_group_meta.encoder_seq_data is None) or
(seq_group_meta.cross_block_table is None))
for seq_group_meta in seq_group_meta_list
])
# - Validate sequence-group status
assert set(get_sequence_groups(out)) == set(running)
# - Validate there is one batched token per seq group
assert out.num_batched_tokens == num_seq_group
# - Validate there are no remaining blocks to swap
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
# - Validate that all seq groups were scheduled
assert len(seq_group_meta_list) == num_seq_group
append_new_token(out, 1)
# Abort sequences
for req_id in req_id_list:
scheduler.abort_seq_group(req_id)
    # - Verify that sequence group cross-attention block tables are
    #   NO LONGER registered with the block manager
    assert all(req_id not in scheduler.block_manager.cross_block_tables
               for req_id in req_id_list)

View File

@@ -0,0 +1,33 @@
import msgspec
from vllm.executor.msgspec_utils import decode_hook, encode_hook
from vllm.sequence import ExecuteModelRequest
from ..spec_decode.utils import create_batch
def test_msgspec_serialization():
num_lookahead_slots = 4
seq_group_metadata_list, _, _ = create_batch(16, num_lookahead_slots)
execute_model_req = ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list,
num_lookahead_slots=num_lookahead_slots,
running_queue_size=4)
encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
decoder = msgspec.msgpack.Decoder(ExecuteModelRequest,
dec_hook=decode_hook)
req = decoder.decode(encoder.encode(execute_model_req))
expected = execute_model_req.seq_group_metadata_list
actual = req.seq_group_metadata_list
assert (len(expected) == len(actual))
expected = expected[0]
actual = actual[0]
assert expected.block_tables == actual.block_tables
assert expected.is_prompt == actual.is_prompt
assert expected.request_id == actual.request_id
assert (expected.seq_data[0].prompt_token_ids ==
actual.seq_data[0].prompt_token_ids)
assert (expected.seq_data[0].output_token_ids ==
actual.seq_data[0].output_token_ids)

View File

@@ -0,0 +1,205 @@
import time
from typing import List, Optional
from typing import Sequence as GenericSequence
from typing import Tuple
from vllm import SamplingParams
from vllm.inputs import EncoderDecoderInputs, token_inputs
from vllm.lora.request import LoRARequest
from vllm.sequence import Logprob, Sequence, SequenceGroup
def create_dummy_prompt(
request_id: str,
prompt_length: int,
block_size: Optional[int] = None,
lora_request: Optional[LoRARequest] = None,
best_of: int = 1,
prompt_tokens: Optional[List[int]] = None,
min_tokens: int = 0,
max_tokens: int = 16,
) -> Tuple[Sequence, SequenceGroup]:
if not block_size:
block_size = prompt_length
if prompt_tokens is None:
        # Create a dummy prompt sequence with tokens 0...prompt_length-1
        # and the prompt string "0 1 ... prompt_length-1".
prompt_tokens = list(range(prompt_length))
prompt_str = " ".join([str(t) for t in prompt_tokens])
prompt = Sequence(int(request_id),
inputs=token_inputs(prompt_tokens, prompt=prompt_str),
block_size=block_size)
seq_group = SequenceGroup(request_id=request_id,
seqs=[prompt],
arrival_time=time.time(),
sampling_params=SamplingParams(
best_of=best_of,
max_tokens=max_tokens,
min_tokens=min_tokens),
lora_request=lora_request)
return prompt, seq_group
def create_dummy_prompt_encoder_decoder(
request_id: str,
decoder_prompt_length: int,
encoder_prompt_length: int,
block_size: Optional[int] = None,
lora_request: Optional[LoRARequest] = None,
best_of: int = 1,
) -> Tuple[Sequence, Sequence, SequenceGroup]:
if not block_size:
block_size = decoder_prompt_length
    # Create dummy prompt sequences with tokens 0...decoder_prompt_length-1
    # (decoder) and the reversed range (encoder). Note that the prompt
    # strings don't actually need to match the token ids.
decoder_prompt_tokens = list(range(decoder_prompt_length))
decoder_prompt_str = " ".join([str(t) for t in decoder_prompt_tokens])
encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length))))
encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens])
inputs: EncoderDecoderInputs = {
"decoder": token_inputs(decoder_prompt_tokens,
prompt=decoder_prompt_str),
"encoder": token_inputs(encoder_prompt_tokens,
prompt=encoder_prompt_str),
}
decoder_prompt = Sequence(int(request_id),
inputs=inputs["decoder"],
block_size=block_size)
encoder_prompt = Sequence(int(request_id),
inputs=inputs["encoder"],
block_size=block_size)
seq_group = SequenceGroup(request_id=request_id,
seqs=[decoder_prompt],
sampling_params=SamplingParams(best_of=best_of),
arrival_time=time.time(),
lora_request=lora_request,
encoder_seq=encoder_prompt)
return decoder_prompt, encoder_prompt, seq_group
def create_seq_group(
seq_prompt_len: int = 1024,
seq_output_lens: GenericSequence[int] = (128, ),
request_id: str = '0',
seq_id_start: int = 0,
sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:
assert len(seq_output_lens) > 0
if sampling_params is None:
sampling_params = SamplingParams()
prompt_token_ids = [0] * seq_prompt_len
seqs: List[Sequence] = []
for seq_id_offset, output_len in enumerate(seq_output_lens):
seq = Sequence(
seq_id=seq_id_start + seq_id_offset,
inputs=token_inputs(prompt_token_ids),
block_size=16,
)
for i in range(output_len):
seq.append_token_id(
token_id=i,
logprobs={i: Logprob(0.0)},
)
seqs.append(seq)
seq_group = SequenceGroup(
request_id=request_id,
seqs=seqs,
sampling_params=sampling_params,
arrival_time=time.time(),
)
return seq_group
def create_seq_group_encoder_decoder(
seq_prompt_len: int = 1024,
seq_output_lens: GenericSequence[int] = (128, ),
request_id: str = '0',
seq_id_start: int = 0,
sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:
assert len(seq_output_lens) > 0
if sampling_params is None:
sampling_params = SamplingParams()
prompt_token_ids = [0] * seq_prompt_len
inputs: EncoderDecoderInputs = {
"decoder": token_inputs(prompt_token_ids),
"encoder": token_inputs(prompt_token_ids),
}
seqs = []
for seq_id_offset, output_len in enumerate(seq_output_lens):
# Construct decoder input sequences
seq = Sequence(
seq_id=seq_id_start + seq_id_offset,
inputs=inputs["decoder"],
block_size=16,
)
for i in range(output_len):
seq.append_token_id(
token_id=i,
logprobs={i: Logprob(0.0)},
)
seqs.append(seq)
# Encoder input sequence
encoder_seq = Sequence(
seq_id=seq_id_start + len(seq_output_lens),
inputs=inputs["encoder"],
block_size=16,
)
return SequenceGroup(request_id=request_id,
seqs=seqs,
sampling_params=sampling_params,
arrival_time=time.time(),
encoder_seq=encoder_seq)
def round_up_to_next_block(seq_len: int, block_size: int) -> int:
return (seq_len + block_size - 1) // block_size
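# NOTE: despite its name, this helper returns the number of blocks needed to
# hold seq_len tokens (a ceiling division), not seq_len rounded up to the
# next block boundary; e.g. round_up_to_next_block(5, 4) == 2.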
# Helper functions for scheduler tests
def get_sequence_groups(scheduler_output):
return [s.seq_group for s in scheduler_output.scheduled_seq_groups]
def append_new_token(out, token_id: int):
seq_groups = get_sequence_groups(out)
for seq_group in seq_groups:
for seq in seq_group.get_seqs():
seq.append_token_id(token_id, {token_id: Logprob(token_id)})
def schedule_and_update_computed_tokens(scheduler):
metas, out, _ = scheduler.schedule()
for s, meta in zip(out.scheduled_seq_groups, metas):
s.seq_group.update_num_computed_tokens(meta.token_chunk_size)
return metas, out
def append_new_token_seq_group(token_chunk_size, seq_group, token_id: int):
seq_group.update_num_computed_tokens(token_chunk_size)
for seq in seq_group.get_seqs():
seq.append_token_id(token_id, {token_id: Logprob(token_id)})
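# NOTE: append_new_token operates on a whole scheduler output (all scheduled
# groups), while append_new_token_seq_group operates on a single group and
# additionally advances its num_computed_tokens by token_chunk_size.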

View File

@@ -0,0 +1,5 @@
port: 12312
served_model_name: mymodel
tensor_parallel_size: 2
trust_remote_code: true
multi_step_stream_outputs: false

View File

@@ -0,0 +1,59 @@
# can only run on machines with p2p access across GPUs
# can only run with torchrun:
# torchrun --nproc_per_node=2 tests/distributed/test_ca_buffer_sharing.py
import ctypes
import torch
import torch.distributed as dist
from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
from vllm.distributed.device_communicators.custom_all_reduce import ( # noqa
CustomAllreduce)
# create a cpu process group for communicating metadata (ipc handle)
dist.init_process_group(backend="gloo")
rank = local_rank = dist.get_rank()
world_size = dist.get_world_size()
# every process sets its own device (differently)
lib = CudaRTLibrary()
lib.cudaSetDevice(rank)
buffer_size_in_bytes = 1024
byte_value = 2 # the value we write to the buffer for verification
pointers = CustomAllreduce.create_shared_buffer(buffer_size_in_bytes)
print(f"Rank {rank} has pointers {pointers}")
dist.barrier()
torch.cuda.synchronize()
if rank == 0:
# the first rank tries to write to all buffers
for p in pointers:
pointer = ctypes.c_void_p(p)
lib.cudaMemset(pointer, byte_value, buffer_size_in_bytes)
dist.barrier()
torch.cuda.synchronize()
host_data = (ctypes.c_char * buffer_size_in_bytes)()
# all ranks read from all buffers, and check if the data is correct
for p in pointers:
pointer = ctypes.c_void_p(p)
lib.cudaMemcpy(host_data, pointer, buffer_size_in_bytes)
for i in range(buffer_size_in_bytes):
assert ord(host_data[i]) == byte_value, (
f"Rank {rank} failed"
f" to verify buffer {p}. Expected {byte_value}, "
f"got {ord(host_data[i])}")
print(f"Rank {rank} verified all buffers")
dist.barrier()
torch.cuda.synchronize()
CustomAllreduce.free_shared_buffer(pointers)

View File

@@ -0,0 +1,200 @@
"""Test the communication operators.
Run `pytest tests/distributed/test_comm_ops.py`.
"""
import os
import pytest
import ray
import torch
from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
tensor_model_parallel_all_gather,
tensor_model_parallel_all_reduce)
from ..utils import init_test_distributed_environment, multi_process_parallel
@ray.remote(num_gpus=1, max_calls=1)
def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
distributed_init_port: str):
    # It is important to delete the MLU_VISIBLE_DEVICES environment variable
    # so that each worker can see all the devices and is able to set its
    # device to the correct one.
del os.environ["MLU_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port)
num_elements = 8
all_tensors = [
torch.arange(num_elements, dtype=torch.float32, device="cuda") *
(r + 1) for r in range(tp_size)
]
expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
t = all_tensors[rank % tp_size]
t = tensor_model_parallel_all_reduce(t)
torch.testing.assert_close(t, expected)
@ray.remote(num_gpus=1, max_calls=1)
def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
distributed_init_port: str):
    # It is important to delete the MLU_VISIBLE_DEVICES environment variable
    # so that each worker can see all the devices and is able to set its
    # device to the correct one.
del os.environ["MLU_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port)
num_dimensions = 3
tensor_size = list(range(2, num_dimensions + 2))
total_size = 1
for s in tensor_size:
total_size *= s
for all_gather_dimension in range(num_dimensions):
all_tensors = [
torch.arange(total_size, dtype=torch.float32,
device="cuda").reshape(tensor_size) * (r + 1)
for r in range(tp_size)
]
expected = torch.cat(all_tensors, dim=all_gather_dimension)
t = all_tensors[rank % tp_size]
t = tensor_model_parallel_all_gather(t, all_gather_dimension)
torch.testing.assert_close(t, expected)
@ray.remote(num_gpus=1, max_calls=1)
def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
distributed_init_port: str):
    # It is important to delete the MLU_VISIBLE_DEVICES environment variable
    # so that each worker can see all the devices and is able to set its
    # device to the correct one.
del os.environ["MLU_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port)
test_dict = {
# device tensor
"a": torch.arange(8, dtype=torch.float32, device="cuda"),
# CPU tensor
"b": torch.arange(16, dtype=torch.int8, device="cpu"),
"c": "test",
"d": [1, 2, 3],
"e": {
"a": 1,
"b": 2
},
# empty tensor
"f": torch.tensor([], dtype=torch.float32, device="cuda"),
}
if (rank % tp_size) == 0:
broadcast_tensor_dict(test_dict, src=0)
else:
recv_dict = broadcast_tensor_dict(src=0)
assert len(recv_dict) == len(test_dict)
torch.testing.assert_close(recv_dict["a"], test_dict["a"])
torch.testing.assert_close(recv_dict["b"], test_dict["b"])
assert recv_dict["c"] == test_dict["c"]
assert recv_dict["d"] == test_dict["d"]
assert recv_dict["e"] == test_dict["e"]
torch.testing.assert_close(recv_dict["f"], test_dict["f"])
@ray.remote(num_gpus=1, max_calls=1)
def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
distributed_init_port: str):
del os.environ["MLU_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port)
test_dict = {
# device tensor
"a": torch.arange(8, dtype=torch.float32, device="cuda"),
# CPU tensor
"b": torch.arange(16, dtype=torch.int8, device="cpu"),
"c": "test",
"d": [1, 2, 3],
"e": {
"a": 1,
"b": 2
},
# empty tensor
"f": torch.tensor([], dtype=torch.float32, device="cuda"),
}
if not get_pp_group().is_first_rank:
recv_dict = get_pp_group().recv_tensor_dict()
if not get_pp_group().is_last_rank:
get_pp_group().send_tensor_dict(test_dict)
if not get_pp_group().is_first_rank:
assert len(recv_dict) == len(test_dict)
torch.testing.assert_close(recv_dict["a"], test_dict["a"])
torch.testing.assert_close(recv_dict["b"], test_dict["b"])
assert recv_dict["c"] == test_dict["c"]
assert recv_dict["d"] == test_dict["d"]
assert recv_dict["e"] == test_dict["e"]
torch.testing.assert_close(recv_dict["f"], test_dict["f"])
@ray.remote(num_gpus=1, max_calls=1)
def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
distributed_init_port: str):
del os.environ["MLU_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port)
size = 64
test_tensor = torch.arange(64, dtype=torch.float32, device="cuda")
if not get_pp_group().is_first_rank:
recv_tensor = get_pp_group().recv(size, dtype=torch.float32)
if not get_pp_group().is_last_rank:
get_pp_group().send(test_tensor)
if not get_pp_group().is_first_rank:
torch.testing.assert_close(test_tensor, recv_tensor)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("test_target", [
all_reduce_test_worker, all_gather_test_worker,
broadcast_tensor_dict_test_worker
])
def test_multi_process_tensor_parallel(tp_size, test_target):
multi_process_parallel(tp_size, 1, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize(
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
def test_multi_process_pipeline_parallel(pp_size, test_target):
multi_process_parallel(1, pp_size, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 4,
reason="Need at least 4 GPUs to run the test.")
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize("test_target", [
send_recv_test_worker, send_recv_tensor_dict_test_worker,
all_reduce_test_worker, all_gather_test_worker,
broadcast_tensor_dict_test_worker
])
def test_multi_process_tensor_parallel_pipeline_parallel(
tp_size, pp_size, test_target):
multi_process_parallel(tp_size, pp_size, test_target)

View File

@@ -0,0 +1,115 @@
import os
import random
import pytest
import ray
import torch
import torch.distributed as dist
from vllm.distributed.communication_op import ( # noqa
tensor_model_parallel_all_reduce)
from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
get_tp_group, graph_capture)
from ..utils import (ensure_model_parallel_initialized,
init_test_distributed_environment, multi_process_parallel)
random.seed(42)
test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
for i, v in enumerate(test_sizes):
test_sizes[i] -= v % 8
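# The sizes above are rounded down to multiples of 8 elements, presumably to
# satisfy the alignment requirements of the custom all-reduce kernel (an
# assumption; the exact constraint lives in the kernel implementation).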
@ray.remote(num_gpus=1, max_calls=1)
def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port)
ensure_model_parallel_initialized(tp_size, pp_size)
group = get_tensor_model_parallel_group().device_group
# A small all_reduce for warmup.
# this is needed because device communicators might be created lazily
# (e.g. NCCL). This will ensure that the communicator is initialized
# before any communication happens, so that this group can be used for
# graph capture immediately.
data = torch.zeros(1)
data = data.to(device=device)
torch.distributed.all_reduce(data, group=group)
torch.cuda.synchronize()
del data
# we use the first group to communicate once
# and the second group to communicate twice
# and so on
# this is used to demonstrate that each group can
# communicate independently
num_communication = rank // tp_size + 1
for sz in test_sizes:
for dtype in [torch.float32, torch.float16, torch.bfloat16]:
with graph_capture() as graph_capture_context:
# use integers so result matches NCCL exactly
inp1 = torch.randint(1,
16, (sz, ),
dtype=dtype,
device=torch.cuda.current_device())
inp2 = torch.randint(1,
16, (sz, ),
dtype=dtype,
device=torch.cuda.current_device())
torch.cuda.synchronize()
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph,
stream=graph_capture_context.stream):
for i in range(num_communication):
out1 = tensor_model_parallel_all_reduce(inp1)
# the input buffer is immediately modified to test
# synchronization
dist.all_reduce(inp1, group=group)
out2 = tensor_model_parallel_all_reduce(inp2)
dist.all_reduce(inp2, group=group)
graph.replay()
torch.testing.assert_close(out1, inp1)
torch.testing.assert_close(out2, inp2)
@ray.remote(num_gpus=1, max_calls=1)
def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port)
# we use the first group to communicate once
# and the second group to communicate twice
# and so on
# this is used to demonstrate that each group can
# communicate independently
num_communication = rank // tp_size + 1
sz = 1024
fa = get_tp_group().ca_comm
inp = torch.ones(sz, dtype=torch.float32, device=device)
out = inp
for _ in range(num_communication):
out = fa.all_reduce(out, registered=False)
torch.testing.assert_close(out, inp * (tp_size**num_communication))
inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
out = inp
for _ in range(num_communication):
out = fa.all_reduce(out, registered=False)
torch.testing.assert_close(out, inp * (tp_size**num_communication))
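# NOTE: starting from a tensor of ones, each all-reduce across tp_size ranks
# multiplies every element by tp_size, so after k reductions the expected
# value is tp_size ** k, which is what eager_allreduce checks above.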
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
world_size = tp_size * pipeline_parallel_size
if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.")
multi_process_parallel(tp_size, pipeline_parallel_size, test_target)

View File

@@ -0,0 +1,6 @@
from ..entrypoints.openai.test_oot_registration import (
run_and_test_dummy_opt_api_server)
def test_distributed_oot(dummy_opt_path: str):
run_and_test_dummy_opt_api_server(dummy_opt_path, tp=2)

View File

@@ -0,0 +1,64 @@
"""Make sure ray assigns GPU workers to the correct node.
Run:
```sh
cd $VLLM_PATH/tests
pytest distributed/test_multi_node_assignment.py
```
"""
import os
import pytest
import ray
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from vllm import initialize_ray_cluster
from vllm.config import ParallelConfig
from vllm.executor.ray_utils import _wait_until_pg_removed
from vllm.utils import get_ip
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
@pytest.mark.skipif(not VLLM_MULTI_NODE,
reason="Need at least 2 nodes to run the test.")
def test_multi_node_assignment() -> None:
# NOTE: important to keep this class definition here
# to let ray use cloudpickle to serialize it.
class Actor:
def get_ip(self):
return get_ip()
for _ in range(10):
config = ParallelConfig(1, 2)
initialize_ray_cluster(config)
current_ip = get_ip()
workers = []
for bundle_id, bundle in enumerate(
config.placement_group.bundle_specs):
if not bundle.get("GPU", 0):
continue
scheduling_strategy = PlacementGroupSchedulingStrategy(
placement_group=config.placement_group,
placement_group_capture_child_tasks=True,
placement_group_bundle_index=bundle_id,
)
worker = ray.remote(
num_cpus=0,
num_gpus=1,
scheduling_strategy=scheduling_strategy,
)(Actor).remote()
worker_ip = ray.get(worker.get_ip.remote())
assert worker_ip == current_ip
workers.append(worker)
for worker in workers:
ray.kill(worker)
_wait_until_pg_removed(config.placement_group)

View File

@@ -0,0 +1,414 @@
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
(2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
important to set the distributed backend to "mp" to avoid Ray scheduling
all workers on a node other than the head node, which can cause the test
to fail.
"""
import os
from dataclasses import dataclass
from typing import List, Literal, NamedTuple, Optional
import pytest
from vllm.config import TaskOption
from vllm.logger import init_logger
from ..utils import compare_two_settings, fork_new_process_for_each_test
logger = init_logger("test_pipeline_parallel")
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
class ParallelSetup(NamedTuple):
tp_size: int
pp_size: int
eager_mode: bool
chunked_prefill: bool
class PPTestOptions(NamedTuple):
multi_node_only: bool
trust_remote_code: bool
tokenizer_mode: Optional[str]
load_format: Optional[str] = None
hf_overrides: Optional[str] = None
@dataclass
class PPTestSettings:
parallel_setups: List[ParallelSetup]
distributed_backends: List[str]
task: TaskOption
test_options: PPTestOptions
@staticmethod
def detailed(
*,
tp_base: int = 1,
pp_base: int = 2,
multi_node_only: bool = False,
task: TaskOption = "auto",
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
load_format: Optional[str] = None,
hf_overrides: Optional[str] = None,
):
return PPTestSettings(
parallel_setups=[
ParallelSetup(tp_size=tp_base,
pp_size=pp_base,
eager_mode=False,
chunked_prefill=False),
ParallelSetup(tp_size=tp_base,
pp_size=2 * pp_base,
eager_mode=False,
chunked_prefill=True),
ParallelSetup(tp_size=tp_base,
pp_size=2 * pp_base,
eager_mode=True,
chunked_prefill=False),
ParallelSetup(tp_size=2 * tp_base,
pp_size=pp_base,
eager_mode=False,
chunked_prefill=True),
ParallelSetup(tp_size=2 * tp_base,
pp_size=pp_base,
eager_mode=True,
chunked_prefill=False),
],
distributed_backends=["mp", "ray"],
task=task,
test_options=PPTestOptions(multi_node_only=multi_node_only,
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
load_format=load_format,
hf_overrides=hf_overrides),
)
@staticmethod
def fast(
*,
tp_base: int = 1,
pp_base: int = 2,
task: TaskOption = "auto",
multi_node_only: bool = False,
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
load_format: Optional[str] = None,
hf_overrides: Optional[str] = None,
):
return PPTestSettings(
parallel_setups=[
ParallelSetup(tp_size=tp_base,
pp_size=pp_base,
eager_mode=True,
chunked_prefill=False),
],
distributed_backends=["mp"],
task=task,
test_options=PPTestOptions(multi_node_only=multi_node_only,
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
load_format=load_format,
hf_overrides=hf_overrides),
)
def iter_params(self, model_name: str):
opts = self.test_options
for parallel_setup in self.parallel_setups:
for distributed_backend in self.distributed_backends:
yield (model_name, parallel_setup, distributed_backend,
self.task, opts)
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model on your GPUs
# The values displayed here are only a rough indicator of the size of the model
# yapf: disable
TEXT_GENERATION_MODELS = {
# [Decoder-only]
# Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
"Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True), # noqa: E501
"baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),
"baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"bigscience/bloomz-1b1": PPTestSettings.fast(),
"THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),
"CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True), # noqa: E501
"databricks/dbrx-instruct": PPTestSettings.fast(tp_base=8),
"Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),
"deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
"tiiuae/falcon-7b": PPTestSettings.fast(),
"google/gemma-2b": PPTestSettings.fast(),
"google/gemma-2-9b": PPTestSettings.fast(),
"gpt2": PPTestSettings.fast(),
"bigcode/starcoder": PPTestSettings.fast(),
"EleutherAI/gpt-j-6b": PPTestSettings.fast(),
"EleutherAI/pythia-12b": PPTestSettings.fast(),
"ibm/PowerLM-3b": PPTestSettings.fast(),
"ibm/PowerMoE-3b": PPTestSettings.fast(),
# Uses Llama
# "internlm/internlm-chat-7b": PPTestSettings.fast(),
"internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
"inceptionai/jais-13b-chat": PPTestSettings.fast(),
# TODO: Implement PP
# "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),
"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
"openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
# Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4),
"mosaicml/mpt-7b": PPTestSettings.fast(),
"nvidia/Minitron-8B-Base": PPTestSettings.fast(),
"allenai/OLMo-1B-hf": PPTestSettings.fast(),
"allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
"facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
"OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
"adept/persimmon-8b-chat": PPTestSettings.fast(),
"microsoft/phi-2": PPTestSettings.fast(),
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
"Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
"stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
"bigcode/starcoder2-3b": PPTestSettings.fast(),
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),
# FIXME: Cannot load tokenizer in latest transformers version.
# Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
# [Encoder-only]
# TODO: Implement PP
# "facebook/bart-base": PPTestSettings.fast(),
}
EMBEDDING_MODELS = { # type: ignore[var-annotated]
# [Text-only]
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True), # noqa: E501
}
MULTIMODAL_MODELS = {
# [Decoder-only]
"Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
"facebook/chameleon-7b": PPTestSettings.fast(),
"adept/fuyu-8b": PPTestSettings.fast(),
"THUDM/glm-4v-9b": PPTestSettings.fast(trust_remote_code=True),
"OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),
"llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
"llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
"llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
"openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),
"allenai/Molmo-7B-D-0924": PPTestSettings.fast(trust_remote_code=True),
"microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"), # noqa: E501
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
"Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
"fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
# [Encoder-decoder]
# TODO: Implement PP
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
}
# yapf: enable
# NOTE: You can update this on your local machine to run specific tests
TEST_MODELS = [
# [LANGUAGE GENERATION]
# "microsoft/Phi-3.5-MoE-instruct",
"meta-llama/Meta-Llama-3-8B",
# "ibm/PowerLM-3b",
# [LANGUAGE EMBEDDING]
# "intfloat/e5-mistral-7b-instruct",
# "BAAI/bge-multilingual-gemma2",
# [MULTIMODAL GENERATION]
# "OpenGVLab/InternVL2-1B",
# "microsoft/Phi-3-vision-128k-instruct",
# "fixie-ai/ultravox-v0_3",
]
def _compare_tp(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
task: TaskOption,
test_options: PPTestOptions,
num_gpus_available: int,
*,
method: Literal["generate", "encode"],
):
tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
multi_node_only, trust_remote_code, tokenizer_mode, \
load_format, hf_overrides = test_options
if num_gpus_available < tp_size * pp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
if VLLM_MULTI_NODE and distributed_backend == "mp":
pytest.skip("Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend")
if multi_node_only and not VLLM_MULTI_NODE:
pytest.skip("Not in multi-node setting")
common_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--max-model-len",
"2048",
"--max-num-seqs",
"8",
]
if chunked_prefill:
common_args.append("--enable-chunked-prefill")
if eager_mode:
common_args.append("--enforce-eager")
if task != "auto":
common_args.extend(["--task", task])
if trust_remote_code:
common_args.append("--trust-remote-code")
if tokenizer_mode:
common_args.extend(["--tokenizer-mode", tokenizer_mode])
if load_format:
common_args.extend(["--load-format", load_format])
if hf_overrides:
common_args.extend(["--hf-overrides", hf_overrides])
if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2
and chunked_prefill):
# Test Ray ADAG for a subset of the tests
pp_env = {
"VLLM_USE_RAY_COMPILED_DAG": "1",
"VLLM_USE_RAY_SPMD_WORKER": "1",
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
}
        # Temporary. Currently when zeromq + SPMD is used, it does not
        # properly terminate because of an aDAG issue.
common_args.append("--disable-frontend-multiprocessing")
else:
pp_env = None
pp_args = [
*common_args,
"--pipeline-parallel-size",
str(pp_size),
"--tensor-parallel-size",
str(tp_size),
"--distributed-executor-backend",
distributed_backend,
]
# compare without pipeline parallelism
# NOTE: use mp backend for TP
# PP tests might involve multiple nodes, and ray might
# schedule all workers in a node other than the head node,
# which can cause the test to fail.
tp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
"--distributed-executor-backend",
"mp",
]
try:
compare_two_settings(model_name,
pp_args,
tp_args,
pp_env,
method=method)
except Exception:
if pp_env is None:
raise
else:
# Ray ADAG tests are flaky, so we don't want to fail the test
logger.exception("Ray ADAG tests failed")
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", "task",
"test_options"),
[
params for model_name, settings in TEXT_GENERATION_MODELS.items()
for params in settings.iter_params(model_name)
if model_name in TEST_MODELS
],
)
@fork_new_process_for_each_test
def test_tp_language_generation(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
task: TaskOption,
test_options: PPTestOptions,
num_gpus_available,
):
_compare_tp(model_name,
parallel_setup,
distributed_backend,
task,
test_options,
num_gpus_available,
method="generate")
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", "task",
"test_options"),
[
params for model_name, settings in EMBEDDING_MODELS.items()
for params in settings.iter_params(model_name)
if model_name in TEST_MODELS
],
)
@fork_new_process_for_each_test
def test_tp_language_embedding(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
task: TaskOption,
test_options: PPTestOptions,
num_gpus_available,
):
_compare_tp(model_name,
parallel_setup,
distributed_backend,
task,
test_options,
num_gpus_available,
method="encode")
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", "task",
"test_options"),
[
params for model_name, settings in MULTIMODAL_MODELS.items()
for params in settings.iter_params(model_name)
if model_name in TEST_MODELS
],
)
@fork_new_process_for_each_test
def test_tp_multimodal_generation(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
task: TaskOption,
test_options: PPTestOptions,
num_gpus_available,
):
_compare_tp(model_name,
parallel_setup,
distributed_backend,
task,
test_options,
num_gpus_available,
method="generate")

View File

@@ -0,0 +1,34 @@
import os
import pytest
from vllm.distributed.utils import get_pp_indices
def test_custom_layer_partition():
def _verify(partition_str, num_layers, pp_size, goldens):
bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str
for pp_rank, golden in enumerate(goldens):
assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
        if bak is not None:
            os.environ["VLLM_PP_LAYER_PARTITION"] = bak
        else:
            os.environ.pop("VLLM_PP_LAYER_PARTITION", None)
# Even partition
_verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
# Balanced partition
_verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
    # Put the remainder somewhere
_verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
# Invalid partition strings
with pytest.raises(ValueError):
_verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
with pytest.raises(ValueError):
_verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
# Wrong number of partitions
with pytest.raises(ValueError):
_verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
# Wrong number of layers
with pytest.raises(ValueError):
_verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])

View File

@@ -0,0 +1,30 @@
import os
import pytest
from ..utils import compare_two_settings, fork_new_process_for_each_test
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
(2, "JackFram/llama-160m"),
])
@pytest.mark.parametrize("ATTN_BACKEND", [
"FLASH_ATTN",
"FLASHINFER",
])
@fork_new_process_for_each_test
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
cudagraph_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--pipeline-parallel-size",
str(PP_SIZE),
"--distributed-executor-backend",
"mp",
]
os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
eager_args = cudagraph_args + ["--enforce-eager"]
compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)

View File

@@ -0,0 +1,241 @@
import multiprocessing
import os
from typing import Dict, List
import pytest
import torch
import torch.distributed
from vllm.distributed.communication_op import ( # noqa
tensor_model_parallel_all_reduce)
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
get_world_group, graph_capture,
init_distributed_environment)
from vllm.utils import update_environment_variables
def distributed_run(fn, world_size):
number_of_processes = world_size
processes: List[multiprocessing.Process] = []
for i in range(number_of_processes):
env: Dict[str, str] = {}
env['RANK'] = str(i)
env['LOCAL_RANK'] = str(i)
env['WORLD_SIZE'] = str(number_of_processes)
env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
env['MASTER_ADDR'] = 'localhost'
env['MASTER_PORT'] = '12345'
p = multiprocessing.Process(target=fn, args=(env, ))
processes.append(p)
p.start()
for p in processes:
p.join()
for p in processes:
assert p.exitcode == 0
def worker_fn_wrapper(fn):
# `multiprocessing.Process` cannot accept environment variables directly
# so we need to pass the environment variables as arguments
# and update the environment variables in the function
def wrapped_fn(env):
update_environment_variables(env)
local_rank = os.environ['LOCAL_RANK']
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
init_distributed_environment()
fn()
return wrapped_fn
@worker_fn_wrapper
def worker_fn():
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
device=get_world_group().device)
tensor = torch.ones(16, 1024, 1024,
dtype=torch.float32).cuda(pynccl_comm.rank)
with pynccl_comm.change_state(enable=True):
pynccl_comm.all_reduce(tensor)
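    # Summing an all-ones tensor across world_size ranks leaves world_size in
    # every element.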
result = tensor.mean().cpu().item()
assert result == pynccl_comm.world_size
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
def test_pynccl():
distributed_run(worker_fn, 2)
@worker_fn_wrapper
def multiple_allreduce_worker_fn():
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
groups = [
torch.distributed.new_group(ranks=[0, 1], backend="gloo"),
torch.distributed.new_group(ranks=[2, 3], backend="gloo")
]
group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1]
pynccl_comm = PyNcclCommunicator(group=group, device=device)
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
with pynccl_comm.change_state(enable=True):
# two groups can communicate independently
if torch.distributed.get_rank() in [0, 1]:
pynccl_comm.all_reduce(tensor)
pynccl_comm.all_reduce(tensor)
result = tensor.mean().cpu().item()
assert result == 4
else:
pynccl_comm.all_reduce(tensor)
result = tensor.mean().cpu().item()
assert result == 2
@pytest.mark.skipif(torch.cuda.device_count() < 4,
reason="Need at least 4 GPUs to run the test.")
def test_pynccl_multiple_allreduce():
# this tests pynccl for multiple tp groups, in a standalone way
# i.e. call `pynccl_comm.all_reduce` directly
distributed_run(multiple_allreduce_worker_fn, 4)
@worker_fn_wrapper
def multiple_allreduce_with_vllm_worker_fn():
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
ensure_model_parallel_initialized(2, 2)
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
with graph_capture():
# two tp groups can communicate independently
if torch.distributed.get_rank() in [0, 1]:
tensor = tensor_model_parallel_all_reduce(tensor)
tensor = tensor_model_parallel_all_reduce(tensor)
result = tensor.mean().cpu().item()
assert result == 4
else:
tensor = tensor_model_parallel_all_reduce(tensor)
result = tensor.mean().cpu().item()
assert result == 2
@pytest.mark.skipif(torch.cuda.device_count() < 4,
reason="Need at least 4 GPUs to run the test.")
def test_pynccl_multiple_allreduce_with_vllm():
# this tests pynccl for multiple tp groups, together with vllm
# i.e. call `tensor_model_parallel_all_reduce`
distributed_run(multiple_allreduce_with_vllm_worker_fn, 4)
@worker_fn_wrapper
def worker_fn_with_cudagraph():
with torch.no_grad():
graph = torch.cuda.CUDAGraph()
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
device=get_world_group().device)
# run something in the default stream to initialize torch engine
a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}')
torch.cuda.synchronize()
with torch.cuda.graph(
graph, stream=pynccl_comm.stream), pynccl_comm.change_state(
enable=True):
# operation during the graph capture is recorded but not executed
# see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture # noqa
pynccl_comm.all_reduce(a)
pynccl_comm.stream.synchronize()
assert a.mean().cpu().item() == pynccl_comm.world_size**0
graph.replay()
pynccl_comm.stream.synchronize()
assert a.mean().cpu().item() == pynccl_comm.world_size**1
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
def test_pynccl_with_cudagraph():
distributed_run(worker_fn_with_cudagraph, 2)
@worker_fn_wrapper
def send_recv_worker_fn():
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
device=get_world_group().device)
if pynccl_comm.rank == 0:
tensor = torch.ones(16, 1024, 1024,
dtype=torch.float32).cuda(pynccl_comm.rank)
else:
tensor = torch.empty(16, 1024, 1024,
dtype=torch.float32).cuda(pynccl_comm.rank)
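    # Rank 0 sends its all-ones tensor to rank 1, which receives it in place,
    # so both ranks end up holding ones.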
with pynccl_comm.change_state(enable=True):
if pynccl_comm.rank == 0:
pynccl_comm.send(tensor,
dst=(pynccl_comm.rank + 1) %
pynccl_comm.world_size)
else:
pynccl_comm.recv(tensor,
src=(pynccl_comm.rank - 1) %
pynccl_comm.world_size)
result = tensor.mean().cpu().item()
assert result == 1
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
def test_pynccl_send_recv():
distributed_run(send_recv_worker_fn, 2)
@worker_fn_wrapper
def multiple_send_recv_worker_fn():
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
groups = [
torch.distributed.new_group(ranks=[0, 2], backend="gloo"),
torch.distributed.new_group(ranks=[1, 3], backend="gloo")
]
group = groups[0] if torch.distributed.get_rank() in [0, 2] else groups[1]
pynccl_comm = PyNcclCommunicator(group=group, device=device)
if torch.distributed.get_rank() == 0:
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
elif torch.distributed.get_rank() == 1:
tensor = 2 * torch.ones(
16, 1024, 1024, dtype=torch.float32, device=device)
else:
tensor = torch.empty(16,
1024,
1024,
dtype=torch.float32,
device=device)
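    # Two independent pairs: global rank 0 sends ones to rank 2, and global
    # rank 1 sends twos to rank 3.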
with pynccl_comm.change_state(enable=True):
if torch.distributed.get_rank() in [0, 1]:
pynccl_comm.send(tensor,
dst=(pynccl_comm.rank + 1) %
pynccl_comm.world_size)
else:
pynccl_comm.recv(tensor,
src=(pynccl_comm.rank - 1) %
pynccl_comm.world_size)
result = tensor.mean().cpu().item()
if torch.distributed.get_rank() in [0, 2]:
assert result == 1
else:
assert result == 2
@pytest.mark.skipif(torch.cuda.device_count() < 4,
reason="Need at least 4 GPUs to run the test.")
def test_pynccl_multiple_send_recv():
distributed_run(multiple_send_recv_worker_fn, 4)
def test_ncclGetUniqueId():
lib = NCCLLibrary()
unique_id = lib.ncclGetUniqueId()
# `list(unique_id.internal)` is something like this:
# [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0,
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# as long as the function doesn't raise an exception, we're good
assert unique_id is not None

View File

@@ -0,0 +1,13 @@
import os
import torch.distributed as dist
from vllm.distributed.parallel_state import in_the_same_node_as
if __name__ == "__main__":
dist.init_process_group(backend="gloo")
test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0))
expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
assert test_result == expected, f"Expected {expected}, got {test_result}"
print("Same node test passed!")

View File

@@ -0,0 +1,88 @@
import multiprocessing
import random
import time
from typing import List
import numpy as np
import torch.distributed as dist
from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
from vllm.utils import update_environment_variables
def get_arrays(n: int, seed: int = 0) -> List[np.ndarray]:
np.random.seed(seed)
sizes = np.random.randint(1, 10_000, n)
# on average, each array will have 5k elements
# with int64, each array will have 40kb
return [np.random.randint(1, 100, i) for i in sizes]
def distributed_run(fn, world_size):
number_of_processes = world_size
processes = []
for i in range(number_of_processes):
env = {}
env['RANK'] = str(i)
env['LOCAL_RANK'] = str(i)
env['WORLD_SIZE'] = str(number_of_processes)
env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
env['MASTER_ADDR'] = 'localhost'
env['MASTER_PORT'] = '12345'
p = multiprocessing.Process(target=fn, args=(env, ))
processes.append(p)
p.start()
for p in processes:
p.join()
for p in processes:
assert p.exitcode == 0
def worker_fn_wrapper(fn):
# `multiprocessing.Process` cannot accept environment variables directly
# so we need to pass the environment variables as arguments
# and update the environment variables in the function
def wrapped_fn(env):
update_environment_variables(env)
dist.init_process_group(backend="gloo")
fn()
return wrapped_fn
@worker_fn_wrapper
def worker_fn():
writer_rank = 2
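    # Shared-memory broadcast queue over the world group, with rank 2 as the
    # single writer and every other rank reading.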
broadcaster = MessageQueue.create_from_process_group(
dist.group.WORLD, 40 * 1024, 2, writer_rank)
if dist.get_rank() == writer_rank:
seed = random.randint(0, 1000)
dist.broadcast_object_list([seed], writer_rank)
else:
recv = [None]
dist.broadcast_object_list(recv, writer_rank)
seed = recv[0] # type: ignore
dist.barrier()
# in case we find a race condition
# print the seed so that we can reproduce the error
print(f"Rank {dist.get_rank()} got seed {seed}")
# test broadcasting with about 400MB of data
N = 10_000
if dist.get_rank() == writer_rank:
arrs = get_arrays(N, seed)
for x in arrs:
broadcaster.broadcast_object(x)
time.sleep(random.random() / 1000)
else:
arrs = get_arrays(N, seed)
for x in arrs:
y = broadcaster.broadcast_object(None)
assert np.array_equal(x, y)
time.sleep(random.random() / 1000)
dist.barrier()
def test_shm_broadcast():
distributed_run(worker_fn, 4)

View File

@@ -0,0 +1,143 @@
import socket
import pytest
import ray
import torch
import vllm.envs as envs
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.utils import StatelessProcessGroup
from vllm.utils import (mlu_device_count_stateless, get_open_port,
update_environment_variables)
from ..utils import multi_gpu_test
@ray.remote
class _CUDADeviceCountStatelessTestActor:
def get_count(self):
return mlu_device_count_stateless()
def set_cuda_visible_devices(self, cuda_visible_devices: str):
update_environment_variables(
{"MLU_VISIBLE_DEVICES": cuda_visible_devices})
def get_cuda_visible_devices(self):
return envs.MLU_VISIBLE_DEVICES
def test_cuda_device_count_stateless():
"""Test that cuda_device_count_stateless changes return value if
CUDA_VISIBLE_DEVICES is changed."""
actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore
num_gpus=2).remote()
assert len(
sorted(ray.get(
actor.get_cuda_visible_devices.remote()).split(","))) == 2
assert ray.get(actor.get_count.remote()) == 2
ray.get(actor.set_cuda_visible_devices.remote("0"))
assert ray.get(actor.get_count.remote()) == 1
ray.get(actor.set_cuda_visible_devices.remote(""))
assert ray.get(actor.get_count.remote()) == 0
def cpu_worker(rank, WORLD_SIZE, port1, port2):
pg1 = StatelessProcessGroup.create(host="127.0.0.1",
port=port1,
rank=rank,
world_size=WORLD_SIZE)
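    # Ranks 0-2 additionally form a second, overlapping group of size 3.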
if rank <= 2:
pg2 = StatelessProcessGroup.create(host="127.0.0.1",
port=port2,
rank=rank,
world_size=3)
data = torch.tensor([rank])
data = pg1.broadcast_obj(data, src=2)
assert data.item() == 2
if rank <= 2:
data = torch.tensor([rank + 1])
data = pg2.broadcast_obj(data, src=2)
assert data.item() == 3
pg2.barrier()
pg1.barrier()
def gpu_worker(rank, WORLD_SIZE, port1, port2):
torch.cuda.set_device(rank)
pg1 = StatelessProcessGroup.create(host="127.0.0.1",
port=port1,
rank=rank,
world_size=WORLD_SIZE)
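    # Stateless process groups are not managed by vLLM's parallel state, so
    # the communicator has to be enabled by hand.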
pynccl1 = PyNcclCommunicator(pg1, device=rank)
pynccl1.disabled = False
if rank <= 2:
pg2 = StatelessProcessGroup.create(host="127.0.0.1",
port=port2,
rank=rank,
world_size=3)
pynccl2 = PyNcclCommunicator(pg2, device=rank)
pynccl2.disabled = False
data = torch.tensor([rank]).cuda()
pynccl1.all_reduce(data)
pg1.barrier()
torch.cuda.synchronize()
if rank <= 2:
pynccl2.all_reduce(data)
pg2.barrier()
torch.cuda.synchronize()
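    # pg1 all-reduce of [0, 1, 2, 3] leaves 6 on every rank; ranks 0-2 then
    # all-reduce their 6s within pg2 to get 18, while rank 3 keeps 6.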
item = data[0].item()
print(f"rank: {rank}, item: {item}")
if rank == 3:
assert item == 6
else:
assert item == 18
def broadcast_worker(rank, WORLD_SIZE, port1, port2):
pg1 = StatelessProcessGroup.create(host="127.0.0.1",
port=port1,
rank=rank,
world_size=WORLD_SIZE)
if rank == 2:
pg1.broadcast_obj("secret", src=2)
else:
obj = pg1.broadcast_obj(None, src=2)
assert obj == "secret"
pg1.barrier()
def allgather_worker(rank, WORLD_SIZE, port1, port2):
pg1 = StatelessProcessGroup.create(host="127.0.0.1",
port=port1,
rank=rank,
world_size=WORLD_SIZE)
data = pg1.all_gather_obj(rank)
assert data == list(range(WORLD_SIZE))
pg1.barrier()
@pytest.mark.skip(reason="This test is flaky and prone to hang.")
# @multi_gpu_test(num_gpus=4)
@pytest.mark.parametrize(
"worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker])
def test_stateless_process_group(worker):
port1 = get_open_port()
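    # Keep port1 bound while picking port2 so get_open_port() cannot hand out
    # the same port twice.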
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("", port1))
port2 = get_open_port()
WORLD_SIZE = 4
from multiprocessing import get_context
ctx = get_context("fork")
processes = []
for i in range(WORLD_SIZE):
rank = i
processes.append(
ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2)))
for p in processes:
p.start()
for p in processes:
p.join()
for p in processes:
assert not p.exitcode
print("All processes finished.")

View File

@@ -0,0 +1,119 @@
"""E2E tests to verify the correctness of the encoder-decoder framework
Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
"""
from typing import List, Optional, Tuple
import pytest
from transformers import AutoModelForSeq2SeqLM
from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
global_force_attn_backend_context_manager)
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs
from ..conftest import DecoderPromptType
from ..models.utils import check_logprobs_close
LIST_ENC_DEC_SUPPORTED_BACKENDS = [
_Backend.XFORMERS, _Backend.FLASH_ATTN, None
]
def vllm_to_hf_output(
vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
decoder_prompt_type: DecoderPromptType,
):
"""Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
hf_output_str = output_str + "</s>"
if decoder_prompt_type == DecoderPromptType.NONE:
hf_output_str = "<s>" + hf_output_str
return output_ids, hf_output_str, out_logprobs
@pytest.fixture(autouse=True)
def clear_cache():
"""Fixture to clear backend cache before each test."""
_cached_get_attn_backend.cache_clear() # Clear the cache
yield # This allows the test to run
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.skipif(
current_platform.is_cpu(),
reason="CPU backend is not currently supported with encoder/decoder models"
)
def test_encoder_decoder_e2e(
hf_runner,
vllm_runner,
example_encoder_decoder_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
decoder_prompt_type: DecoderPromptType,
enforce_eager: bool,
attn_backend: _Backend,
) -> None:
'''
End-to-End (E2E) test for the encoder-decoder framework.
This test evaluates the encoder-decoder functionality using the BART
model. We compare the outputs of the Hugging Face and vLLM
implementations to ensure that both implementations produce consistent
and correct results.
'''
with global_force_attn_backend_context_manager(attn_backend):
if attn_backend == _Backend.FLASH_ATTN:
# Flash Attention works only with bfloat16 data-type
dtype = 'bfloat16'
test_case_prompts = example_encoder_decoder_prompts[
decoder_prompt_type]
# Configuration settings for HF baseline
hf_kwargs = {
"top_k": None,
"num_beams": 1,
"repetition_penalty": 1.0,
"top_p": 1.0,
"length_penalty": 1.0,
"early_stopping": False,
"no_repeat_ngram_size": None,
"min_length": 0
}
with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForSeq2SeqLM) as hf_model:
hf_outputs = (
hf_model.generate_encoder_decoder_greedy_logprobs_limit(
test_case_prompts,
max_tokens,
num_logprobs,
**hf_kwargs,
))
with vllm_runner(model, dtype=dtype,
enforce_eager=enforce_eager) as vllm_model:
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
test_case_prompts, max_tokens, num_logprobs)
hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
else 0)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=[
vllm_to_hf_output(vllm_output, decoder_prompt_type)
for vllm_output in vllm_outputs
],
name_0="hf",
name_1="vllm",
num_outputs_0_skip_tokens=hf_skip_tokens,
)

View File

View File

@@ -0,0 +1,271 @@
import random
from unittest.mock import MagicMock
import pytest
from transformers import PreTrainedTokenizer
from vllm.core.scheduler import Scheduler
from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor
from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.sampling_params import SamplingParams
from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
SequenceOutput, SequenceStatus)
from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.utils import Counter
from ...core.utils import create_seq_group
@pytest.mark.parametrize("seq_output_len", [128])
@pytest.mark.parametrize("num_new_tokens", [1, 12])
@pytest.mark.skip_global_cleanup
def test_appends_token_ids(num_new_tokens: int, seq_output_len: int):
"""Verify multi-step decoding appends token ids correctly.
We append token ids and verify all the token ids were appended correctly.
Note that ignore_eos=True.
"""
detokenizer = MagicMock(spec=Detokenizer)
scheduler = MagicMock(spec=Scheduler)
stop_checker = MagicMock(spec=StopChecker)
seq_counter = Counter()
output_processor = MultiStepOutputProcessor(
detokenizer=detokenizer,
scheduler=[scheduler],
seq_counter=seq_counter,
get_tokenizer_for_seq=lambda _: mock_tokenizer(),
stop_checker=stop_checker,
)
seq_group = create_seq_group(
seq_prompt_len=1024,
seq_output_lens=[seq_output_len],
sampling_params=SamplingParams(max_tokens=seq_output_len +
num_new_tokens,
ignore_eos=True),
)
seq = seq_group.get_seqs()[0]
seq.status = SequenceStatus.RUNNING
new_token_ids = list(range(num_new_tokens))
outputs = [
CompletionSequenceGroupOutput(
samples=[
SequenceOutput(
parent_seq_id=seq.seq_id,
output_token=output_token,
logprobs={output_token: Logprob(0.0)},
)
],
prompt_logprobs=None,
) for output_token in new_token_ids
]
assert seq.get_token_ids()[-len(new_token_ids):] != new_token_ids
output_processor.process_outputs(seq_group, outputs)
assert seq.get_token_ids()[-len(new_token_ids):] == new_token_ids
@pytest.mark.parametrize("seq_prompt_len", [1024])
@pytest.mark.parametrize("seq_output_len", [128])
@pytest.mark.parametrize("num_new_tokens", [5, 6, 7, 8])
@pytest.mark.parametrize("max_tokens", [128 + 3])
@pytest.mark.skip_global_cleanup
def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int,
seq_output_len: int, max_tokens: int):
"""Verify tokens after max_tokens are dropped and not appended to the
sequence.
"""
detokenizer = MagicMock(spec=Detokenizer)
scheduler = MagicMock(spec=Scheduler)
stop_checker = MagicMock(spec=StopChecker)
seq_counter = Counter()
output_processor = MultiStepOutputProcessor(
detokenizer=detokenizer,
scheduler=[scheduler],
seq_counter=seq_counter,
get_tokenizer_for_seq=lambda _: mock_tokenizer(),
stop_checker=stop_checker,
)
seq_group = create_seq_group(
seq_prompt_len=seq_prompt_len,
seq_output_lens=[seq_output_len],
sampling_params=SamplingParams(max_tokens=max_tokens, ),
)
seq = seq_group.get_seqs()[0]
seq.status = SequenceStatus.RUNNING
new_token_ids = list(range(num_new_tokens))
outputs = [
CompletionSequenceGroupOutput(
samples=[
SequenceOutput(
parent_seq_id=seq.seq_id,
output_token=output_token,
logprobs={output_token: Logprob(0.0)},
)
],
prompt_logprobs=None,
) for output_token in new_token_ids
]
assert seq.get_len() == seq_prompt_len + seq_output_len
output_processor.process_outputs(seq_group, outputs)
# Expect the processed sequence to not go over max tokens in len.
assert seq.get_len() == seq_prompt_len + max_tokens
# Expect the correct tokens were appended.
expected_appended_tokens = new_token_ids[:max_tokens - seq_output_len]
assert seq.get_token_ids(
)[-len(expected_appended_tokens):] == expected_appended_tokens
@pytest.mark.parametrize("seq_prompt_len", [1024])
@pytest.mark.parametrize("seq_output_len", [128])
@pytest.mark.parametrize("num_new_tokens", [12])
@pytest.mark.parametrize("seed", list(range(6)))
@pytest.mark.skip_global_cleanup
def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
seq_output_len: int, seed: int):
"""Verify the eos token id is included in the sequence, but subsequent
tokens are dropped (not appended to sequence).
"""
random.seed(seed)
detokenizer = MagicMock(spec=Detokenizer)
scheduler = MagicMock(spec=Scheduler)
stop_checker = MagicMock(spec=StopChecker)
seq_counter = Counter()
eos_token_id = 100
output_processor = MultiStepOutputProcessor(
detokenizer=detokenizer,
scheduler=[scheduler],
seq_counter=seq_counter,
get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
stop_checker=stop_checker,
)
seq_group = create_seq_group(
seq_prompt_len=seq_prompt_len,
seq_output_lens=[seq_output_len],
sampling_params=SamplingParams(
# Ensure enough space.
max_tokens=seq_output_len + num_new_tokens, ),
)
seq = seq_group.get_seqs()[0]
seq.status = SequenceStatus.RUNNING
new_token_ids = list(range(num_new_tokens))
assert eos_token_id not in new_token_ids
eos_index = random.randint(0, len(new_token_ids) - 1)
new_token_ids[eos_index] = eos_token_id
outputs = [
CompletionSequenceGroupOutput(
samples=[
SequenceOutput(
parent_seq_id=seq.seq_id,
output_token=output_token,
logprobs={output_token: Logprob(0.0)},
)
],
prompt_logprobs=None,
) for output_token in new_token_ids
]
assert seq.get_len() == seq_prompt_len + seq_output_len
output_processor.process_outputs(seq_group, outputs)
# Expect the processed sequence to not go beyond provided eos.
assert seq.get_len() == seq_prompt_len + seq_output_len + (eos_index + 1)
# Expect the correct tokens were appended.
expected_appended_tokens = new_token_ids[:eos_index + 1]
assert seq.get_token_ids(
)[-len(expected_appended_tokens):] == expected_appended_tokens
@pytest.mark.parametrize("seq_prompt_len", [1024])
@pytest.mark.parametrize("seq_output_len", [128])
@pytest.mark.parametrize("num_new_tokens", [12])
@pytest.mark.parametrize("seed", list(range(6)))
@pytest.mark.skip_global_cleanup
def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
seq_output_len: int, seed: int):
"""When sampling parameters dictate that we should ignore the eos token id,
ensure all token ids are appended even if the eos token id is emitted.
"""
random.seed(seed)
detokenizer = MagicMock(spec=Detokenizer)
scheduler = MagicMock(spec=Scheduler)
stop_checker = MagicMock(spec=StopChecker)
seq_counter = Counter()
eos_token_id = 100
output_processor = MultiStepOutputProcessor(
detokenizer=detokenizer,
scheduler=[scheduler],
seq_counter=seq_counter,
get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
stop_checker=stop_checker,
)
seq_group = create_seq_group(
seq_prompt_len=seq_prompt_len,
seq_output_lens=[seq_output_len],
sampling_params=SamplingParams(
# Ensure enough space.
max_tokens=seq_output_len + num_new_tokens,
ignore_eos=True,
),
)
seq = seq_group.get_seqs()[0]
seq.status = SequenceStatus.RUNNING
new_token_ids = list(range(num_new_tokens))
assert eos_token_id not in new_token_ids
eos_index = random.randint(0, len(new_token_ids) - 1)
new_token_ids[eos_index] = eos_token_id
outputs = [
CompletionSequenceGroupOutput(
samples=[
SequenceOutput(
parent_seq_id=seq.seq_id,
output_token=output_token,
logprobs={output_token: Logprob(0.0)},
)
],
prompt_logprobs=None,
) for output_token in new_token_ids
]
assert seq.get_len() == seq_prompt_len + seq_output_len
output_processor.process_outputs(seq_group, outputs)
# Expect the processed sequence to go beyond eos.
assert seq.get_len() == seq_prompt_len + seq_output_len + num_new_tokens
# Expect the correct tokens were appended.
    expected_appended_tokens = new_token_ids[:num_new_tokens]
assert seq.get_token_ids(
)[-len(expected_appended_tokens):] == expected_appended_tokens
def mock_tokenizer(eos_token_id=1000):
tokenizer = MagicMock(spec=PreTrainedTokenizer)
tokenizer.eos_token_id = eos_token_id
return tokenizer

View File

@@ -0,0 +1,86 @@
from unittest.mock import MagicMock
import pytest
from transformers import PreTrainedTokenizer
from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.inputs import token_inputs
from vllm.sampling_params import SamplingParams
from vllm.sequence import Logprob, Sequence, SequenceStatus
def sequence_with_eos(text: str, eos_token: str,
eos_token_id: int) -> Sequence:
"""
Create a Sequence that ends with an EOS token.
"""
seq = Sequence(
seq_id=0,
inputs=token_inputs([]),
block_size=16,
eos_token_id=eos_token_id,
)
seq.output_text = text + eos_token
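    # Offset the generated ids past eos_token_id so none of them collide with
    # the EOS token appended at the end.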
offset = eos_token_id + 1
for i in range(offset, len(text) + offset):
seq.append_token_id(token_id=i, logprobs={i: Logprob(0.0)})
seq.append_token_id(token_id=eos_token_id,
logprobs={eos_token_id: Logprob(0.0)})
seq.status = SequenceStatus.RUNNING
return seq
@pytest.mark.parametrize(["text_wo_eos", "eos_token", "eos_token_id"], [
("This text ends with EOS token", "</s>", 2),
])
@pytest.mark.parametrize("ignore_eos", [True, False])
@pytest.mark.parametrize("include_stop_str_in_output", [True, False])
@pytest.mark.skip_global_cleanup
def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int,
ignore_eos: bool, include_stop_str_in_output: bool):
"""
Test the behavior of the StopChecker's maybe_stop_sequence method
when an EOS token is encountered.
This test covers:
- When the EOS token should stop the sequence and be removed from the output
- When the EOS token should stop the sequence and be included in the output
- When the EOS token should be ignored, and the sequence continues
"""
tokenizer = MagicMock(spec=PreTrainedTokenizer)
get_tokenizer_for_seq = MagicMock(return_value=tokenizer)
stop_checker = StopChecker(max_model_len=1024,
get_tokenizer_for_seq=get_tokenizer_for_seq)
seq = sequence_with_eos(
text=text_wo_eos,
eos_token=eos_token,
eos_token_id=eos_token_id,
)
new_char_count = len(eos_token)
# Note that `stop` and `stop_token_ids` are not specified
sampling_params = SamplingParams(
min_tokens=1,
ignore_eos=ignore_eos,
include_stop_str_in_output=include_stop_str_in_output)
stop_checker.maybe_stop_sequence(
seq=seq,
new_char_count=new_char_count,
sampling_params=sampling_params,
)
if ignore_eos:
assert seq.status == SequenceStatus.RUNNING
assert seq.output_text == text_wo_eos + eos_token
elif include_stop_str_in_output:
assert seq.status == SequenceStatus.FINISHED_STOPPED
assert seq.output_text == text_wo_eos + eos_token
else:
assert seq.status == SequenceStatus.FINISHED_STOPPED
assert seq.output_text == text_wo_eos

View File

@@ -0,0 +1,95 @@
from argparse import ArgumentTypeError
import pytest
from vllm.config import PoolerConfig
from vllm.engine.arg_utils import EngineArgs, nullable_kvs
from vllm.utils import FlexibleArgumentParser
@pytest.mark.parametrize(("arg", "expected"), [
(None, None),
("image=16", {
"image": 16
}),
("image=16,video=2", {
"image": 16,
"video": 2
}),
("Image=16, Video=2", {
"image": 16,
"video": 2
}),
])
def test_limit_mm_per_prompt_parser(arg, expected):
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
if arg is None:
args = parser.parse_args([])
else:
args = parser.parse_args(["--limit-mm-per-prompt", arg])
assert args.limit_mm_per_prompt == expected
def test_valid_pooling_config():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
args = parser.parse_args([
'--override-pooler-config',
'{"pooling_type": "MEAN"}',
])
engine_args = EngineArgs.from_cli_args(args=args)
assert engine_args.override_pooler_config == PoolerConfig(
pooling_type="MEAN", )
@pytest.mark.parametrize(
("arg"),
[
"image", # Missing =
"image=4,image=5", # Conflicting values
"image=video=4" # Too many = in tokenized arg
])
def test_bad_nullable_kvs(arg):
with pytest.raises(ArgumentTypeError):
nullable_kvs(arg)
# yapf: disable
@pytest.mark.parametrize(("arg", "expected", "option"), [
(None, None, "mm-processor-kwargs"),
("{}", {}, "mm-processor-kwargs"),
(
'{"num_crops": 4}',
{
"num_crops": 4
},
"mm-processor-kwargs"
),
(
'{"foo": {"bar": "baz"}}',
{
"foo":
{
"bar": "baz"
}
},
"mm-processor-kwargs"
),
(
'{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}',
{
"cast_logits_dtype": "bfloat16",
"sequence_parallel_norm": True,
"sequence_parallel_norm_threshold": 2048,
},
"override-neuron-config"
),
])
# yapf: enable
def test_composite_arg_parser(arg, expected, option):
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
if arg is None:
args = parser.parse_args([])
else:
args = parser.parse_args([f"--{option}", arg])
assert getattr(args, option.replace("-", "_")) == expected

View File

@@ -0,0 +1,40 @@
import pytest
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams
'''
=============================
Modify by vllm_mlu
=============================
@brief(enable_prefix_caching): Prefix caching is not supported yet; will be fixed in VLLM-342.
'''
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("block_size", [16])
def test_computed_prefix_blocks(model: str, block_size: int):
# This test checks if we are able to run the engine to completion
# without triggering asserts.
# We are in a scenario where all blocks from the second request's prompt
# are full and already computed when the second request arrives.
prompt = (
"You are a helpful assistant. How do I build a car from cardboard and "
"paper clips? Is there an easy to follow video tutorial available "
"online for free?")
prompt2 = (
" Please recommend to me some resources where I can learn not only to "
"handle technical difficulties of building a car, but also "
"decoration.")
engine_args = EngineArgs(model=model,
block_size=block_size,
enable_prefix_caching=False)
engine = LLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams()
engine.add_request("0", prompt + prompt2, sampling_params)
engine.step()
engine.add_request("1", prompt, sampling_params)
engine.step()

View File

@@ -0,0 +1,116 @@
import asyncio
import os
import pytest
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
from vllm.executor.mlu_executor import MLUExecutor, MLUExecutorAsync
from vllm.sampling_params import SamplingParams
class Mock:
...
'''
=============================
Modify by vllm_mlu
=============================
@brief(GPUExecutor): Use the MLU executor on MLU devices.
'''
class CustomGPUExecutor(MLUExecutor):
def execute_model(self, *args, **kwargs):
        # Drop a marker file to show that this was run
with open(".marker", "w"):
...
return super().execute_model(*args, **kwargs)
'''
=============================
Modify by vllm_mlu
=============================
@brief(GPUExecutor): Use the MLU executor on MLU devices.
'''
class CustomGPUExecutorAsync(MLUExecutorAsync):
async def execute_model_async(self, *args, **kwargs):
with open(".marker", "w"):
...
return await super().execute_model_async(*args, **kwargs)
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor_type_checking(model):
with pytest.raises(ValueError):
engine_args = EngineArgs(model=model,
distributed_executor_backend=Mock)
LLMEngine.from_engine_args(engine_args)
with pytest.raises(ValueError):
engine_args = AsyncEngineArgs(model=model,
distributed_executor_backend=Mock)
AsyncLLMEngine.from_engine_args(engine_args)
with pytest.raises(TypeError):
engine_args = AsyncEngineArgs(
model=model, distributed_executor_backend=CustomGPUExecutor)
AsyncLLMEngine.from_engine_args(engine_args)
'''
=============================
Modify by vllm_mlu
=============================
@brief(tmpdir): All test models are soft-linked into the tests dir; do not change the working dir.
'''
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor(model, tmp_path):
cwd = os.path.abspath(".")
# os.chdir(tmp_path)
try:
assert not os.path.exists(".marker")
engine_args = EngineArgs(
model=model, distributed_executor_backend=CustomGPUExecutor)
engine = LLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1)
engine.add_request("0", "foo", sampling_params)
engine.step()
assert os.path.exists(".marker")
os.remove(".marker")
finally:
os.chdir(cwd)
'''
=============================
Modify by vllm_mlu
=============================
@brief(tmpdir): All test models are soft-linked into the tests dir; do not change the working dir.
'''
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor_async(model, tmp_path):
cwd = os.path.abspath(".")
# os.chdir(tmp_path)
try:
assert not os.path.exists(".marker")
engine_args = AsyncEngineArgs(
model=model, distributed_executor_backend=CustomGPUExecutorAsync)
engine = AsyncLLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1)
async def t():
stream = await engine.add_request("0", "foo", sampling_params)
async for x in stream:
...
asyncio.run(t())
assert os.path.exists(".marker")
os.remove(".marker")
finally:
os.chdir(cwd)

View File

@@ -0,0 +1,32 @@
import pytest
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_computed_prefix_blocks(model: str):
# This test checks if the engine generates completions both with and
# without optional detokenization, that detokenization includes text
# and no-detokenization doesn't, and that both completions have the same
# token_ids.
prompt = (
"You are a helpful assistant. How do I build a car from cardboard and "
"paper clips? Is there an easy to follow video tutorial available "
"online for free?")
llm = LLM(model=model)
sampling_params = SamplingParams(max_tokens=10,
temperature=0.0,
detokenize=False)
outputs_no_detokenization = llm.generate(prompt,
sampling_params)[0].outputs[0]
sampling_params.detokenize = True
outputs_with_detokenization = llm.generate(prompt,
sampling_params)[0].outputs[0]
assert outputs_no_detokenization.text == ''
assert outputs_with_detokenization.text != ''
assert outputs_no_detokenization.token_ids == \
outputs_with_detokenization.token_ids

View File

@@ -0,0 +1,176 @@
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from time import sleep
from typing import Any, List, Tuple
import pytest
from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
ResultHandler, WorkerMonitor)
class DummyWorker:
"""Dummy version of vllm.worker.worker.Worker"""
def __init__(self, rank: int):
self.rank = rank
def worker_method(self, worker_input: Any) -> Tuple[int, Any]:
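        # Simulate a little work, then echo (rank, worker_input) so callers
        # can verify routing.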
sleep(0.05)
if isinstance(worker_input, Exception):
# simulate error case
raise worker_input
        return self.rank, worker_input
def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]:
result_handler = ResultHandler()
workers = [
ProcessWorkerWrapper(result_handler, partial(DummyWorker, rank=rank))
for rank in range(8)
]
worker_monitor = WorkerMonitor(workers, result_handler)
assert not worker_monitor.is_alive()
result_handler.start()
worker_monitor.start()
assert worker_monitor.is_alive()
return workers, worker_monitor
def test_local_workers() -> None:
"""Test workers with sync task submission"""
workers, worker_monitor = _start_workers()
def execute_workers(worker_input: str) -> None:
worker_outputs = [
worker.execute_method("worker_method", worker_input)
for worker in workers
]
for rank, output in enumerate(worker_outputs):
            assert output.get() == (rank, worker_input)
executor = ThreadPoolExecutor(max_workers=4)
# Test concurrent submission from different threads
futures = [
executor.submit(partial(execute_workers, f"thread {thread_num}"))
for thread_num in range(4)
]
for future in futures:
future.result()
# Test error case
exception = ValueError("fake error")
result = workers[0].execute_method("worker_method", exception)
try:
result.get()
pytest.fail("task should have failed")
except Exception as e:
assert isinstance(e, ValueError)
assert str(e) == "fake error"
# Test cleanup when a worker fails
assert worker_monitor.is_alive()
workers[3].process.kill()
# Other workers should get shut down here
worker_monitor.join(20)
# Ensure everything is stopped
assert not worker_monitor.is_alive()
assert all(not worker.process.is_alive() for worker in workers)
# Further attempts to submit tasks should fail
try:
_result = workers[0].execute_method("worker_method", "test")
pytest.fail("task should fail once workers have been shut down")
except Exception as e:
assert isinstance(e, ChildProcessError)
def test_local_workers_clean_shutdown() -> None:
"""Test clean shutdown"""
workers, worker_monitor = _start_workers()
assert worker_monitor.is_alive()
assert all(worker.process.is_alive() for worker in workers)
# Clean shutdown
worker_monitor.close()
worker_monitor.join(20)
# Ensure everything is stopped
assert not worker_monitor.is_alive()
assert all(not worker.process.is_alive() for worker in workers)
# Further attempts to submit tasks should fail
try:
_result = workers[0].execute_method("worker_method", "test")
pytest.fail("task should fail once workers have been shut down")
except Exception as e:
assert isinstance(e, ChildProcessError)
@pytest.mark.asyncio
async def test_local_workers_async() -> None:
"""Test local workers with async task submission"""
workers, worker_monitor = _start_workers()
async def execute_workers(worker_input: str) -> None:
worker_coros = [
worker.execute_method_async("worker_method", worker_input)
for worker in workers
]
results = await asyncio.gather(*worker_coros)
for rank, result in enumerate(results):
            assert result == (rank, worker_input)
tasks = [
asyncio.create_task(execute_workers(f"task {task_num}"))
for task_num in range(4)
]
for task in tasks:
await task
# Test error case
exception = ValueError("fake error")
try:
_result = await workers[0].execute_method_async(
"worker_method", exception)
pytest.fail("task should have failed")
except Exception as e:
assert isinstance(e, ValueError)
assert str(e) == "fake error"
# Test cleanup when a worker fails
assert worker_monitor.is_alive()
workers[3].process.kill()
# Other workers should get shut down here
worker_monitor.join(20)
# Ensure everything is stopped
assert not worker_monitor.is_alive()
assert all(not worker.process.is_alive() for worker in workers)
# Further attempts to submit tasks should fail
try:
_result = await workers[0].execute_method_async(
"worker_method", "test")
pytest.fail("task should fail once workers have been shut down")
except Exception as e:
assert isinstance(e, ChildProcessError)

View File

@@ -0,0 +1,29 @@
import pytest
from ..conftest import IMAGE_ASSETS
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"USER: <image>\nWhat's the content of the image?\nASSISTANT:",
"cherry_blossom":
"USER: <image>\nWhat is the season?\nASSISTANT:",
})
models = ["llava-hf/llava-1.5-7b-hf"]
@pytest.mark.parametrize("model", models)
def test_context_length_too_short(vllm_runner, image_assets, model):
images = [asset.pil_image for asset in image_assets]
with pytest.raises(ValueError, match="too long to fit into the model"):
vllm_model = vllm_runner(
model,
max_model_len=128, # LLaVA has a feature size of 576
enforce_eager=True,
)
with vllm_model:
vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]],
max_tokens=1,
images=[images[0]])

View File

@@ -0,0 +1,24 @@
import pytest
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_skip_tokenizer_initialization(model: str):
# This test checks if the flag skip_tokenizer_init skips the initialization
    # of the tokenizer and detokenizer. The generated output is expected to contain
# token ids.
llm = LLM(model=model, skip_tokenizer_init=True)
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
with pytest.raises(ValueError, match="cannot pass text prompts when"):
llm.generate("abc", sampling_params)
outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
sampling_params=sampling_params)
assert len(outputs) > 0
completions = outputs[0].outputs
assert len(completions) > 0
assert completions[0].text == ""
assert completions[0].token_ids

View File

@@ -0,0 +1,62 @@
"""Test the different finish_reason="stop" situations during generation:
1. One of the provided stop strings
2. One of the provided stop tokens
3. The EOS token
Run `pytest tests/engine/test_stop_reason.py`.
"""
import pytest
import transformers
from vllm import SamplingParams
MODEL = "facebook/opt-350m"
STOP_STR = "."
SEED = 42
MAX_TOKENS = 1024
@pytest.fixture
def vllm_model(vllm_runner):
with vllm_runner(MODEL) as vllm_model:
yield vllm_model
def test_stop_reason(vllm_model, example_prompts):
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)
stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR)
llm = vllm_model.model
# test stop token
outputs = llm.generate(example_prompts,
sampling_params=SamplingParams(
ignore_eos=True,
seed=SEED,
max_tokens=MAX_TOKENS,
stop_token_ids=[stop_token_id]))
for output in outputs:
output = output.outputs[0]
assert output.finish_reason == "stop"
assert output.stop_reason == stop_token_id
# test stop string
outputs = llm.generate(example_prompts,
sampling_params=SamplingParams(
ignore_eos=True,
seed=SEED,
max_tokens=MAX_TOKENS,
stop="."))
for output in outputs:
output = output.outputs[0]
assert output.finish_reason == "stop"
assert output.stop_reason == STOP_STR
# test EOS token
outputs = llm.generate(example_prompts,
sampling_params=SamplingParams(
seed=SEED, max_tokens=MAX_TOKENS))
for output in outputs:
output = output.outputs[0]
assert output.finish_reason == "length" or (
output.finish_reason == "stop" and output.stop_reason is None)

View File

@@ -0,0 +1,163 @@
from typing import Any, List, Optional
import pytest
from vllm import CompletionOutput, LLMEngine, SamplingParams
MODEL = "meta-llama/llama-2-7b-hf"
MAX_TOKENS = 200
IS_ASYNC = False
@pytest.fixture(scope="session")
def vllm_model(vllm_runner):
with vllm_runner(MODEL) as vllm_model:
yield vllm_model
def _test_stopping(llm_engine: LLMEngine,
expected_output: str,
expected_reason: Any,
stop: Optional[List[str]] = None,
stop_token_ids: Optional[List[int]] = None,
include_in_output: bool = False,
use_async_output_proc: bool = False) -> None:
llm_engine.add_request(
"id", "A story about vLLM:\n",
SamplingParams(
temperature=0.0,
max_tokens=MAX_TOKENS,
stop=stop,
stop_token_ids=stop_token_ids,
include_stop_str_in_output=include_in_output,
), None)
output: Optional[CompletionOutput] = None
output_text = ""
stop_reason = None
if use_async_output_proc:
llm_engine.step()
while llm_engine.has_unfinished_requests():
(request_output, ) = llm_engine.step()
(output, ) = request_output.outputs
# Ensure we don't backtrack
assert output.text.startswith(output_text)
output_text = output.text
stop_reason = output.stop_reason
assert output is not None
assert output_text == expected_output
assert stop_reason == expected_reason
def _set_async_mode(llm_engine, is_async):
llm_engine.scheduler[0].use_async_output_proc = is_async
def _stop_basic(llm_engine, is_async):
_test_stopping(llm_engine,
stop=["."],
include_in_output=False,
expected_output="VLLM is a 100% volunteer organization",
expected_reason=".",
use_async_output_proc=is_async)
_test_stopping(llm_engine,
stop=["."],
include_in_output=True,
expected_output="VLLM is a 100% volunteer organization.",
expected_reason=".",
use_async_output_proc=is_async)
def _stop_multi_tokens(llm_engine, is_async):
_test_stopping(
llm_engine,
stop=["group of peo", "short"],
include_in_output=False,
expected_output="VLLM is a 100% volunteer organization. We are a ",
expected_reason="group of peo",
use_async_output_proc=is_async)
_test_stopping(
llm_engine,
stop=["group of peo", "short"],
include_in_output=True,
expected_output=
"VLLM is a 100% volunteer organization. We are a group of peo",
expected_reason="group of peo",
use_async_output_proc=is_async)
def _stop_partial_token(llm_engine, is_async):
_test_stopping(llm_engine,
stop=["gani"],
include_in_output=False,
expected_output="VLLM is a 100% volunteer or",
expected_reason="gani",
use_async_output_proc=is_async)
_test_stopping(llm_engine,
stop=["gani"],
include_in_output=True,
expected_output="VLLM is a 100% volunteer organi",
expected_reason="gani",
use_async_output_proc=is_async)
def _stop_token_id(llm_engine, is_async):
# token id 13013 => " organization"
_test_stopping(llm_engine,
stop_token_ids=[13013],
include_in_output=False,
expected_output="VLLM is a 100% volunteer",
expected_reason=13013,
use_async_output_proc=is_async)
_test_stopping(llm_engine,
stop_token_ids=[13013],
include_in_output=True,
expected_output="VLLM is a 100% volunteer organization",
expected_reason=13013,
use_async_output_proc=is_async)
@pytest.mark.skip_global_cleanup
def test_stop_basic(vllm_model):
_set_async_mode(vllm_model.model.llm_engine, True)
_stop_basic(vllm_model.model.llm_engine, is_async=True)
_set_async_mode(vllm_model.model.llm_engine, False)
_stop_basic(vllm_model.model.llm_engine, is_async=False)
@pytest.mark.skip_global_cleanup
def test_stop_multi_tokens(vllm_model):
_set_async_mode(vllm_model.model.llm_engine, True)
_stop_multi_tokens(vllm_model.model.llm_engine, is_async=True)
_set_async_mode(vllm_model.model.llm_engine, False)
_stop_multi_tokens(vllm_model.model.llm_engine, is_async=False)
@pytest.mark.skip_global_cleanup
def test_stop_partial_token(vllm_model):
_set_async_mode(vllm_model.model.llm_engine, True)
_stop_partial_token(vllm_model.model.llm_engine, is_async=True)
_set_async_mode(vllm_model.model.llm_engine, False)
_stop_partial_token(vllm_model.model.llm_engine, is_async=False)
@pytest.mark.skip_global_cleanup
def test_stop_token_id(vllm_model):
_set_async_mode(vllm_model.model.llm_engine, True)
_stop_token_id(vllm_model.model.llm_engine, is_async=True)
_set_async_mode(vllm_model.model.llm_engine, False)
_stop_token_id(vllm_model.model.llm_engine, is_async=False)
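
# The paired calls above differ only in include_stop_str_in_output. The
# invariant they check can be stated as plain string slicing; a pure-Python
# sketch, no engine required:
def truncate_at_stop(text: str, stop: str, include_stop: bool) -> str:
    idx = text.find(stop)
    if idx == -1:
        return text
    return text[:idx + len(stop)] if include_stop else text[:idx]


assert truncate_at_stop("VLLM is a 100% volunteer organization. We are",
                        ".", False) == "VLLM is a 100% volunteer organization"
assert truncate_at_stop("VLLM is a 100% volunteer organization. We are",
                        ".", True) == "VLLM is a 100% volunteer organization."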

View File

@@ -0,0 +1,89 @@
import pytest
@pytest.fixture
def sample_prompts():
return [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
@pytest.fixture
def sample_token_ids():
return [
[0],
[0, 1],
[0, 2, 1],
[0, 3, 1, 2],
]
@pytest.fixture
def sample_regex():
return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
@pytest.fixture
def sample_json_schema():
return {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"age": {
"type": "integer"
},
"skills": {
"type": "array",
"items": {
"type": "string",
"maxLength": 10
},
"minItems": 3
},
"work_history": {
"type": "array",
"items": {
"type": "object",
"properties": {
"company": {
"type": "string"
},
"duration": {
"type": "number"
},
"position": {
"type": "string"
}
},
"required": ["company", "position"]
}
}
},
"required": ["name", "age", "skills", "work_history"]
}
@pytest.fixture
def sample_guided_choice():
return [
"Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript",
"Ruby", "Swift", "Kotlin"
]
@pytest.fixture
def sample_sql_statements():
return ("""
start: select_statement
select_statement: "SELECT" column "from" table "where" condition
column: "col_1" | "col_2"
table: "table_1" | "table_2"
condition: column "=" number
number: "1" | "2"
""")

View File

@@ -0,0 +1,56 @@
"""
This file tests the accuracy of vLLM via LMEval.
It uses lm_eval's "vllm" backend, which drives the engine directly
through the Python API (no API server or local-completions involved),
and checks GSM8K accuracy against an expected reference value.
"""
import lm_eval
import pytest
from vllm.platforms import current_platform
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
EXPECTED_VALUE = 0.58
def run_test():
"""Run the end to end accuracy test."""
model_args = f"pretrained={MODEL_NAME},max_model_len=2048"
results = lm_eval.simple_evaluate(
model="vllm",
model_args=model_args,
tasks="gsm8k",
batch_size="auto",
)
measured_value = results["results"][TASK][FILTER]
assert (measured_value - RTOL < EXPECTED_VALUE
and measured_value + RTOL > EXPECTED_VALUE
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
@pytest.mark.skipif(not current_platform.is_cuda(),
reason="V1 is currently only supported on CUDA.")
def test_lm_eval_accuracy_v1_engine(monkeypatch):
"""Run with the V1 Engine."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
run_test()
def test_lm_eval_accuracy_v0_engine(monkeypatch):
"""Run with the V0 Engine."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
run_test()
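
# Despite the name RTOL, the two-sided comparison in run_test is an absolute
# +/-0.03 band around EXPECTED_VALUE; the same check, spelled out:
def within_tolerance(measured: float,
                     expected: float = EXPECTED_VALUE,
                     tol: float = RTOL) -> bool:
    return abs(measured - expected) < tol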

View File

@@ -0,0 +1,93 @@
from typing import List
import pytest
from vllm import LLM
from ..openai.test_vision import TEST_IMAGE_URLS
def test_chat():
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
prompt1 = "Explain the concept of entropy."
messages = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": prompt1
},
]
outputs = llm.chat(messages)
assert len(outputs) == 1
def test_multi_chat():
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
prompt1 = "Explain the concept of entropy."
prompt2 = "Explain what among us is."
conversation1 = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": prompt1
},
]
conversation2 = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": prompt2
},
]
messages = [conversation1, conversation2]
outputs = llm.chat(messages)
assert len(outputs) == 2
@pytest.mark.skip("Not support Phi vision model yet.")
@pytest.mark.parametrize("image_urls",
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: List[str]):
llm = LLM(
model="microsoft/Phi-3.5-vision-instruct",
dtype="bfloat16",
max_model_len=4096,
max_num_seqs=5,
enforce_eager=True,
trust_remote_code=True,
limit_mm_per_prompt={"image": 2},
)
messages = [{
"role":
"user",
"content": [
*({
"type": "image_url",
"image_url": {
"url": image_url
}
} for image_url in image_urls),
{
"type": "text",
"text": "What's in this image?"
},
],
}]
outputs = llm.chat(messages)
    assert len(outputs) == 1

View File

@@ -0,0 +1,107 @@
import weakref
from typing import List
import pytest
from vllm import LLM, EmbeddingRequestOutput, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
PROMPTS = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
TOKEN_IDS = [
# Using ID={0, 1, 2, 3} results in NaN values,
# so we add this offset of 1000
[1000],
[1000, 1001],
[1000, 1002, 1001],
[1000, 1003, 1001, 1002],
]
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(model=MODEL_NAME,
max_num_batched_tokens=32768,
tensor_parallel_size=1,
gpu_memory_utilization=0.75,
enforce_eager=True)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
def assert_outputs_equal(o1: List[EmbeddingRequestOutput],
o2: List[EmbeddingRequestOutput]):
assert [o.outputs for o in o1] == [o.outputs for o in o2]
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
prompt_token_ids):
pooling_params = PoolingParams()
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.encode(prompt_token_ids=prompt_token_ids,
pooling_params=pooling_params)
v2_output = llm.encode({"prompt_token_ids": prompt_token_ids},
pooling_params=pooling_params)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
pooling_params = PoolingParams()
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.encode(prompt_token_ids=TOKEN_IDS,
pooling_params=pooling_params)
v2_output = llm.encode(
[{
"prompt_token_ids": p
} for p in TOKEN_IDS],
pooling_params=pooling_params,
)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_multiple_pooling_params(llm: LLM):
pooling_params = [
PoolingParams(),
PoolingParams(),
PoolingParams(),
PoolingParams(),
]
# Multiple PoolingParams should be matched with each prompt
outputs = llm.encode(PROMPTS, pooling_params=pooling_params)
assert len(PROMPTS) == len(outputs)
    # An exception is raised if the sizes of params and prompts do not match
with pytest.raises(ValueError):
outputs = llm.encode(PROMPTS, pooling_params=pooling_params[:3])
# Single PoolingParams should be applied to every prompt
single_pooling_params = PoolingParams()
outputs = llm.encode(PROMPTS, pooling_params=single_pooling_params)
assert len(PROMPTS) == len(outputs)
# pooling_params is None, default params should be applied
outputs = llm.encode(PROMPTS, pooling_params=None)
assert len(PROMPTS) == len(outputs)
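
# Each EmbeddingRequestOutput wraps one pooled vector; a minimal sketch of
# consuming it (cosine similarity in plain Python, fixture model reused;
# .outputs.embedding is assumed to be a list[float] in this vLLM version):
import math


def cosine_similarity(a, b) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    return dot / (norm_a * norm_b)


def compare_first_two(llm):
    outputs = llm.encode(PROMPTS[:2])
    return cosine_similarity(outputs[0].outputs.embedding,
                             outputs[1].outputs.embedding)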

View File

@@ -0,0 +1,104 @@
import weakref
from typing import List
import pytest
from vllm import LLM, RequestOutput, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
MODEL_NAME = "facebook/opt-125m"
PROMPTS = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
TOKEN_IDS = [
[0],
[0, 1],
[0, 2, 1],
[0, 3, 1, 2],
]
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(model=MODEL_NAME,
max_num_batched_tokens=4096,
tensor_parallel_size=1,
gpu_memory_utilization=0.10,
enforce_eager=True)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]):
assert [o.outputs for o in o1] == [o.outputs for o in o2]
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
prompt_token_ids):
sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.generate(prompt_token_ids=prompt_token_ids,
sampling_params=sampling_params)
v2_output = llm.generate({"prompt_token_ids": prompt_token_ids},
sampling_params=sampling_params)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.generate(prompt_token_ids=TOKEN_IDS,
sampling_params=sampling_params)
v2_output = llm.generate(
[{
"prompt_token_ids": p
} for p in TOKEN_IDS],
sampling_params=sampling_params,
)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_multiple_sampling_params(llm: LLM):
sampling_params = [
SamplingParams(temperature=0.01, top_p=0.95),
SamplingParams(temperature=0.3, top_p=0.95),
SamplingParams(temperature=0.7, top_p=0.95),
SamplingParams(temperature=0.99, top_p=0.95),
]
# Multiple SamplingParams should be matched with each prompt
outputs = llm.generate(PROMPTS, sampling_params=sampling_params)
assert len(PROMPTS) == len(outputs)
    # An exception is raised if the sizes of params and prompts do not match
with pytest.raises(ValueError):
outputs = llm.generate(PROMPTS, sampling_params=sampling_params[:3])
# Single SamplingParams should be applied to every prompt
single_sampling_params = SamplingParams(temperature=0.3, top_p=0.95)
outputs = llm.generate(PROMPTS, sampling_params=single_sampling_params)
assert len(PROMPTS) == len(outputs)
# sampling_params is None, default params should be applied
outputs = llm.generate(PROMPTS, sampling_params=None)
assert len(PROMPTS) == len(outputs)
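
# These fixtures yield weakref.proxy(llm) because pytest holds on to fixture
# return values; handing out a proxy lets `del llm` plus cleanup actually
# free the engine. The mechanics in isolation (CPython refcounting assumed):
import weakref


class _Engine:

    def ping(self) -> str:
        return "pong"


_engine = _Engine()
_proxy = weakref.proxy(_engine)  # what the fixture yields to tests
assert _proxy.ping() == "pong"   # transparent while the target is alive
del _engine                      # teardown drops the only strong reference
try:
    _proxy.ping()
except ReferenceError:
    pass                         # the proxy did not keep the engine alive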

View File

@@ -0,0 +1,66 @@
import weakref
import pytest
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
PROMPTS = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
LORA_NAME = "typeof/zephyr-7b-beta-lora"
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(model=MODEL_NAME,
tensor_parallel_size=1,
max_model_len=8192,
enable_lora=True,
max_loras=4,
max_lora_rank=64,
max_num_seqs=128,
enforce_eager=True)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
@pytest.fixture(scope="module")
def zephyr_lora_files():
return snapshot_download(repo_id=LORA_NAME)
@pytest.mark.skip_global_cleanup
def test_multiple_lora_requests(llm: LLM, zephyr_lora_files):
lora_request = [
LoRARequest(LORA_NAME + str(idx), idx + 1, zephyr_lora_files)
for idx in range(len(PROMPTS))
]
    # Multiple LoRARequests should be matched one-to-one with the prompts
outputs = llm.generate(PROMPTS, lora_request=lora_request)
assert len(PROMPTS) == len(outputs)
    # An exception is raised if the sizes of params and prompts do not match
with pytest.raises(ValueError):
outputs = llm.generate(PROMPTS, lora_request=lora_request[:1])
# Single LoRARequest should be applied to every prompt
single_lora_request = lora_request[0]
outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
assert len(PROMPTS) == len(outputs)

View File

@@ -0,0 +1,161 @@
import json
import re
import weakref
import jsonschema
import pytest
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.entrypoints.llm import LLM
from vllm.outputs import RequestOutput
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(model=MODEL_NAME, max_model_len=1024)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_guided_regex(sample_regex, llm):
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
guided_decoding=GuidedDecodingParams(regex=sample_regex))
outputs = llm.generate(prompts=[
f"Give an example IPv4 address with this regex: {sample_regex}"
] * 2,
sampling_params=sampling_params,
use_tqdm=True)
assert outputs is not None
for output in outputs:
assert output is not None
assert isinstance(output, RequestOutput)
prompt = output.prompt
generated_text = output.outputs[0].text
print(generated_text)
assert generated_text is not None
assert re.fullmatch(sample_regex, generated_text) is not None
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@pytest.mark.skip_global_cleanup
def test_guided_json_completion(sample_json_schema, llm):
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
outputs = llm.generate(prompts=[
f"Give an example JSON for an employee profile "
f"that fits this schema: {sample_json_schema}"
] * 2,
sampling_params=sampling_params,
use_tqdm=True)
assert outputs is not None
for output in outputs:
assert output is not None
assert isinstance(output, RequestOutput)
prompt = output.prompt
generated_text = output.outputs[0].text
assert generated_text is not None
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
output_json = json.loads(generated_text)
jsonschema.validate(instance=output_json, schema=sample_json_schema)
@pytest.mark.skip_global_cleanup
def test_guided_choice_completion(sample_guided_choice, llm):
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
outputs = llm.generate(
prompts="The best language for type-safe systems programming is ",
sampling_params=sampling_params,
use_tqdm=True)
assert outputs is not None
for output in outputs:
assert output is not None
assert isinstance(output, RequestOutput)
prompt = output.prompt
generated_text = output.outputs[0].text
print(generated_text)
assert generated_text is not None
assert generated_text in sample_guided_choice
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@pytest.mark.skip_global_cleanup
def test_guided_grammar(sample_sql_statements, llm):
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(grammar=sample_sql_statements))
outputs = llm.generate(
prompts=("Generate a sql state that select col_1 from "
"table_1 where it is equals to 1"),
sampling_params=sampling_params,
use_tqdm=True,
)
assert outputs is not None
for output in outputs:
assert output is not None
assert isinstance(output, RequestOutput)
prompt = output.prompt
generated_text = output.outputs[0].text
assert generated_text is not None
# use Lark to parse the output, and make sure it's a valid parse tree
from lark import Lark
parser = Lark(sample_sql_statements)
parser.parse(generated_text)
# remove spaces for comparison b/c we removed them in the grammar
ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(
" ", "")
assert generated_text.strip() == ground_truth
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@pytest.mark.skip_global_cleanup
def test_guided_options_request_deprecation_warning(sample_regex, llm):
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
with pytest.warns(DeprecationWarning, match="guided_options_request"):
llm.generate(prompts="This should fail",
sampling_params=sampling_params,
use_tqdm=True,
guided_options_request=dict(guided_regex=sample_regex))
@pytest.mark.skip_global_cleanup
def test_validation_against_both_guided_decoding_options(sample_regex, llm):
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
guided_decoding=GuidedDecodingParams(regex=sample_regex))
with pytest.raises(ValueError, match="Cannot set both"):
llm.generate(prompts="This should fail",
sampling_params=sampling_params,
use_tqdm=True,
guided_options_request=dict(guided_regex=sample_regex))
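
# A quick offline check that the sample_regex fixture really pins down
# dotted-quad IPv4 strings -- the same property test_guided_regex asserts
# with re.fullmatch on generated text:
import re

_IPV4_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
               r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")

assert re.fullmatch(_IPV4_REGEX, "192.168.0.1")
assert re.fullmatch(_IPV4_REGEX, "255.255.255.255")
assert re.fullmatch(_IPV4_REGEX, "256.1.1.1") is None  # octet out of range
assert re.fullmatch(_IPV4_REGEX, "1.2.3") is None      # too few octets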

View File

@@ -0,0 +1,22 @@
import pytest
from vllm import LLM
from ...utils import error_on_warning
MODEL_NAME = "facebook/opt-125m"
def test_pos_args_deprecated():
with error_on_warning(DeprecationWarning):
LLM(model=MODEL_NAME, tokenizer=MODEL_NAME)
with error_on_warning(DeprecationWarning):
LLM(MODEL_NAME, tokenizer=MODEL_NAME)
with pytest.warns(DeprecationWarning, match="'tokenizer'"):
LLM(MODEL_NAME, MODEL_NAME)
with pytest.warns(DeprecationWarning,
match="'tokenizer', 'tokenizer_mode'"):
LLM(MODEL_NAME, MODEL_NAME, "auto")

View File

@@ -0,0 +1,55 @@
import sys
from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
def test_lazy_outlines(sample_regex):
"""If users don't use guided decoding, outlines should not be imported.
"""
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM without guided decoding as a baseline.
llm = LLM(model="facebook/opt-125m",
enforce_eager=True,
gpu_memory_utilization=0.3)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
# make sure outlines is not imported
assert 'outlines' not in sys.modules
# Destroy the LLM object and free up the GPU memory.
del llm
cleanup_dist_env_and_memory()
# Create an LLM with guided decoding enabled.
llm = LLM(model="facebook/opt-125m",
enforce_eager=True,
guided_decoding_backend="lm-format-enforcer",
gpu_memory_utilization=0.6)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(
prompts=[
f"Give an example IPv4 address with this regex: {sample_regex}"
] * 2,
sampling_params=sampling_params,
use_tqdm=True,
guided_options_request=dict(guided_regex=sample_regex))
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
# make sure outlines is not imported
assert 'outlines' not in sys.modules
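
# The sys.modules probe used twice above generalizes to any lazy-import
# guard; the same check as a tiny reusable helper (a sketch):
import sys


def assert_not_imported(module_name: str) -> None:
    # Importing a package registers it (and its parents) in sys.modules,
    # so absence there proves nothing has pulled the module in yet.
    assert module_name not in sys.modules, f"{module_name} was imported"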

View File

@@ -0,0 +1,25 @@
import pytest
from vllm import LLM
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.mark.skip_v1
def test_empty_prompt():
llm = LLM(model="gpt2", enforce_eager=True)
with pytest.raises(ValueError, match='Prompt cannot be empty'):
llm.generate([""])
@pytest.mark.skip_v1
def test_out_of_vocab_token():
llm = LLM(model="gpt2", enforce_eager=True)
with pytest.raises(ValueError, match='out of vocabulary'):
llm.generate({"prompt_token_ids": [999999]})

View File

@@ -0,0 +1,82 @@
"""Tests for HF_HUB_OFFLINE mode"""
import importlib
import sys
import pytest
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
MODEL_CONFIGS = [
{
"model": "facebook/opt-125m",
"enforce_eager": True,
"gpu_memory_utilization": 0.20,
"max_model_len": 64,
"max_num_batched_tokens": 64,
"max_num_seqs": 64,
"tensor_parallel_size": 1,
},
# {
# "model": "mistralai/Mistral-7B-Instruct-v0.1",
# "enforce_eager": True,
# "gpu_memory_utilization": 0.95,
# "max_model_len": 64,
# "max_num_batched_tokens": 64,
# "max_num_seqs": 64,
# "tensor_parallel_size": 1,
# "tokenizer_mode": "mistral",
# },
]
@pytest.fixture(scope="module")
def cache_models():
# Cache model files first
for model_config in MODEL_CONFIGS:
LLM(**model_config)
cleanup_dist_env_and_memory()
yield
@pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models")
def test_offline_mode(monkeypatch):
# Set HF to offline mode and ensure we can still construct an LLM
try:
monkeypatch.setenv("HF_HUB_OFFLINE", "1")
        # Need to re-import huggingface_hub and friends to set up offline mode
_re_import_modules()
# Cached model files should be used in offline mode
for model_config in MODEL_CONFIGS:
LLM(**model_config)
finally:
# Reset the environment after the test
# NB: Assuming tests are run in online mode
monkeypatch.delenv("HF_HUB_OFFLINE")
_re_import_modules()
def _re_import_modules():
hf_hub_module_names = [
k for k in sys.modules if k.startswith("huggingface_hub")
]
transformers_module_names = [
k for k in sys.modules if k.startswith("transformers")
and not k.startswith("transformers_modules")
]
reload_exception = None
for module_name in hf_hub_module_names + transformers_module_names:
try:
importlib.reload(sys.modules[module_name])
except Exception as e:
reload_exception = e
            # Try to continue cleanup so that other tests are less
            # likely to be affected
    # Fail this test if reloading a module failed
if reload_exception is not None:
raise reload_exception
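
# Why the reload dance is needed: huggingface_hub captures HF_HUB_OFFLINE
# into a module-level constant at import time, so flipping the env var alone
# changes nothing for already-imported modules. A sketch (assumes
# huggingface_hub is importable and was first imported in online mode):
import importlib
import os

import huggingface_hub.constants

os.environ["HF_HUB_OFFLINE"] = "1"
print(huggingface_hub.constants.HF_HUB_OFFLINE)  # stale value from import
importlib.reload(huggingface_hub.constants)
print(huggingface_hub.constants.HF_HUB_OFFLINE)  # now reflects the env var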

View File

@@ -0,0 +1,92 @@
"""
This file tests the accuracy of the vLLM server via LMEval.
It uses local-completions, which interacts with vLLM
through the OAI API with N concurrent connections.
This simulates real-world usage of the API and makes
sure that the zmq frontend mp RPC message passing and
AsyncLLMEngine are working correctly.
"""
import lm_eval
import pytest
from vllm.platforms import current_platform
from ...utils import RemoteOpenAIServer
'''
=============================
Modified by vllm_mlu
=============================
We do not have Qwen2-1.5B-Instruct locally, so we use Qwen2-7B-Instruct instead.
'''
# The original model is: MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
EXPECTED_VALUE = 0.67
DEFAULT_ARGS = ["--max-model-len", "2048", "--disable-log-requests"]
MORE_ARGS_LIST = [
[], # Default
["--enable-chunked-prefill"], # Chunked
["--num-scheduler-steps", "8"], # MS
["--num-scheduler-steps", "8", "--multi-step-stream-outputs"] # MS+Stream
]
MAX_WAIT_SECONDS = None
if current_platform.is_tpu():
MORE_ARGS_LIST = [
[], # Default
# ["--num-scheduler-steps", "8"], # Multi-step << currently fails
]
MAX_WAIT_SECONDS = 600
def run_test(more_args):
"""Run the end to end accuracy test."""
args = list(DEFAULT_ARGS)
args.extend(more_args)
print(f"Running with: {args}")
with RemoteOpenAIServer(
MODEL_NAME, args,
max_wait_seconds=MAX_WAIT_SECONDS) as remote_server:
url = f"{remote_server.url_for('v1')}/completions"
model_args = (
f"model={MODEL_NAME},"
f"base_url={url},"
f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")
results = lm_eval.simple_evaluate(
model="local-completions",
model_args=model_args,
tasks=TASK,
)
measured_value = results["results"][TASK][FILTER]
assert (measured_value - RTOL < EXPECTED_VALUE
and measured_value + RTOL > EXPECTED_VALUE
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
@pytest.mark.skipif(not current_platform.is_cuda(),
reason="V1 currently only supported on CUDA")
def test_lm_eval_accuracy_v1_engine(monkeypatch):
"""Run with the V1 Engine."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
run_test([])
@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args):
"""Run with the V0 Engine."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
run_test(more_args)

View File

@@ -0,0 +1,259 @@
from typing import Dict, List
import openai
import pytest
import pytest_asyncio
from vllm.assets.audio import AudioAsset
from vllm.multimodal.utils import encode_audio_base64, fetch_audio
from ...utils import RemoteOpenAIServer
MODEL_NAME = "fixie-ai/ultravox-v0_3"
TEST_AUDIO_URLS = [
AudioAsset("winning_call").url,
]
@pytest.fixture(scope="module")
def server():
args = [
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
"--max-num-seqs",
"5",
"--enforce-eager",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.fixture(scope="session")
def base64_encoded_audio() -> Dict[str, str]:
return {
audio_url: encode_audio_base64(*fetch_audio(audio_url))
for audio_url in TEST_AUDIO_URLS
}
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
model_name: str, audio_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "audio_url",
"audio_url": {
"url": audio_url
}
},
{
"type": "text",
"text": "What's happening in this audio?"
},
],
}]
# test single completion
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
logprobs=True,
top_logprobs=5)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=202, total_tokens=212)
    message = choice.message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_single_chat_session_audio_base64encoded(
client: openai.AsyncOpenAI, model_name: str, audio_url: str,
base64_encoded_audio: Dict[str, str]):
messages = [{
"role":
"user",
"content": [
{
"type": "audio_url",
"audio_url": {
"url":
f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
}
},
{
"type": "text",
"text": "What's happening in this audio?"
},
],
}]
# test single completion
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
logprobs=True,
top_logprobs=5)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=202, total_tokens=212)
    message = choice.message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
model_name: str, audio_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "audio_url",
"audio_url": {
"url": audio_url
}
},
{
"type": "text",
"text": "What's happening in this audio?"
},
],
}]
# test single completion
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
)
output = chat_completion.choices[0].message.content
stop_reason = chat_completion.choices[0].finish_reason
# test streaming
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
stream=True,
)
chunks: List[str] = []
finish_reason_count = 0
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.role:
assert delta.role == "assistant"
if delta.content:
chunks.append(delta.content)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
    # finish_reason should only be returned in the last chunk
assert finish_reason_count == 1
assert chunk.choices[0].finish_reason == stop_reason
assert delta.content
assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
audio_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "audio_url",
"audio_url": {
"url": audio_url
}
},
{
"type": "audio_url",
"audio_url": {
"url": audio_url
}
},
{
"type": "text",
"text": "What's happening in this audio?"
},
],
}]
with pytest.raises(openai.BadRequestError): # test multi-audio input
await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
)
# the server should still work afterwards
completion = await client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
completion = completion.choices[0].text
assert completion is not None and len(completion) >= 0
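
# The base64 variant above just inlines the audio bytes as a data URL; the
# same thing can be built without vLLM's helpers. A sketch (the file path
# is hypothetical):
import base64


def audio_data_url(path: str) -> str:
    with open(path, "rb") as f:
        payload = base64.b64encode(f.read()).decode("utf-8")
    # Matches the f"data:audio/wav;base64,{...}" shape used in the test.
    return f"data:audio/wav;base64,{payload}"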

View File

@@ -0,0 +1,105 @@
from http import HTTPStatus
from typing import List
import pytest
import pytest_asyncio
import requests
from vllm.version import __version__ as VLLM_VERSION
from ...utils import RemoteOpenAIServer
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope='module')
def server_args(request: pytest.FixtureRequest) -> List[str]:
""" Provide extra arguments to the server via indirect parametrization
Usage:
>>> @pytest.mark.parametrize(
>>> "server_args",
>>> [
>>> ["--disable-frontend-multiprocessing"],
>>> [
>>> "--model=NousResearch/Hermes-3-Llama-3.1-70B",
>>> "--enable-auto-tool-choice",
>>> ],
>>> ],
>>> indirect=True,
>>> )
>>> def test_foo(server, client):
>>> ...
This will run `test_foo` twice with servers with:
- `--disable-frontend-multiprocessing`
- `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.
"""
if not hasattr(request, "param"):
return []
val = request.param
if isinstance(val, str):
return [val]
return request.param
@pytest.fixture(scope="module")
def server(server_args):
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--enforce-eager",
"--max-num-seqs",
"128",
*server_args,
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.parametrize(
"server_args",
[
pytest.param([], id="default-frontend-multiprocessing"),
pytest.param(["--disable-frontend-multiprocessing"],
id="disable-frontend-multiprocessing")
],
indirect=True,
)
@pytest.mark.asyncio
async def test_show_version(server: RemoteOpenAIServer):
response = requests.get(server.url_for("version"))
response.raise_for_status()
assert response.json() == {"version": VLLM_VERSION}
@pytest.mark.parametrize(
"server_args",
[
pytest.param([], id="default-frontend-multiprocessing"),
pytest.param(["--disable-frontend-multiprocessing"],
id="disable-frontend-multiprocessing")
],
indirect=True,
)
@pytest.mark.asyncio
async def test_check_health(server: RemoteOpenAIServer):
response = requests.get(server.url_for("health"))
assert response.status_code == HTTPStatus.OK
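
# The /health endpoint checked above is also what a launcher can poll while
# the server warms up; a small standalone sketch (URL and timeout are
# illustrative, not part of the test suite):
import time

import requests


def wait_for_health(base_url: str, timeout: float = 60.0) -> None:
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            if requests.get(f"{base_url}/health",
                            timeout=2).status_code == 200:
                return
        except requests.exceptions.ConnectionError:
            pass  # server socket not open yet
        time.sleep(1)
    raise TimeoutError(f"server at {base_url} never became healthy")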

View File

@@ -0,0 +1,985 @@
# imports for guided decoding tests
import json
import re
from typing import Dict, List, Optional
import jsonschema
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import torch
from openai import BadRequestError
from ...utils import RemoteOpenAIServer
from .test_completion import zephyr_lora_added_tokens_files # noqa: F401
from .test_completion import zephyr_lora_files # noqa: F401
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--enforce-eager",
# lora config below
"--enable-lora",
"--lora-modules",
f"zephyr-lora={zephyr_lora_files}",
f"zephyr-lora2={zephyr_lora_added_tokens_files}",
"--max-lora-rank",
"64",
"--max-cpu-loras",
"2",
"--max-num-seqs",
"128",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
# first test base model, then test loras
"model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=5,
temperature=0.0,
logprobs=False)
choice = chat_completion.choices[0]
assert choice.logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
# just test 1 lora hereafter
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=5,
temperature=0.0,
logprobs=True,
top_logprobs=0)
choice = chat_completion.choices[0]
assert choice.logprobs is not None
assert choice.logprobs.content is not None
assert len(choice.logprobs.content[0].top_logprobs) == 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=5,
temperature=0.0,
logprobs=True,
top_logprobs=5)
choice = chat_completion.choices[0]
assert choice.logprobs is not None
assert choice.logprobs.content is not None
assert len(choice.logprobs.content[0].top_logprobs) == 5
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
# Default max_logprobs is 20, so this should raise an error
with pytest.raises((openai.BadRequestError, openai.APIError)):
stream = await client.chat.completions.create(model=model_name,
messages=messages,
max_completion_tokens=10,
logprobs=True,
top_logprobs=21,
stream=True)
async for chunk in stream:
...
with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(model=model_name,
messages=messages,
max_completion_tokens=10,
logprobs=True,
top_logprobs=30,
stream=False)
# the server should still work afterwards
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
stream=False)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name, prompt_logprobs",
[(MODEL_NAME, 1), (MODEL_NAME, 0), (MODEL_NAME, -1), (MODEL_NAME, None)],
)
async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI,
model_name: str,
prompt_logprobs: Optional[int]):
params: Dict = {
"messages": [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "Who won the world series in 2020?"
}, {
"role":
"assistant",
"content":
"The Los Angeles Dodgers won the World Series in 2020."
}, {
"role": "user",
"content": "Where was it played?"
}],
"model":
model_name
}
if prompt_logprobs is not None:
params["extra_body"] = {"prompt_logprobs": prompt_logprobs}
if prompt_logprobs is not None and prompt_logprobs < 0:
with pytest.raises(BadRequestError):
await client.chat.completions.create(**params)
else:
completion = await client.chat.completions.create(**params)
if prompt_logprobs is not None:
assert completion.prompt_logprobs is not None
assert len(completion.prompt_logprobs) > 0
else:
assert completion.prompt_logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME],
)
async def test_more_than_one_prompt_logprobs_chat(client: openai.AsyncOpenAI,
model_name: str):
params: Dict = {
"messages": [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "Who won the world series in 2020?"
}, {
"role":
"assistant",
"content":
"The Los Angeles Dodgers won the World Series in 2020."
}, {
"role": "user",
"content": "Where was it played?"
}],
"model":
model_name,
"extra_body": {
"prompt_logprobs": 1
}
}
completion_1 = await client.chat.completions.create(**params)
params["extra_body"] = {"prompt_logprobs": 2}
completion_2 = await client.chat.completions.create(**params)
assert len(completion_1.prompt_logprobs[3]) == 1
assert len(completion_2.prompt_logprobs[3]) == 2
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_single_chat_session(client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
# test single completion
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
logprobs=True,
top_logprobs=5)
assert chat_completion.id is not None
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=37, total_tokens=47)
message = choice.message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
# just test 1 lora hereafter
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
# test single completion
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
)
output = chat_completion.choices[0].message.content
stop_reason = chat_completion.choices[0].finish_reason
# test streaming
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
stream=True,
)
chunks: List[str] = []
finish_reason_count = 0
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.role:
assert delta.role == "assistant"
if delta.content:
chunks.append(delta.content)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
    # finish_reason should only be returned in the last chunk
assert finish_reason_count == 1
assert chunk.choices[0].finish_reason == stop_reason
assert delta.content
assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
)
async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "What is the capital of France?"
}]
# Test stream=True, stream_options={"include_usage": False}
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
stream=True,
stream_options={"include_usage": False})
async for chunk in stream:
assert chunk.usage is None
# Test stream=True, stream_options={"include_usage": True,
# "continuous_usage_stats": False}}
stream = await client.chat.completions.create(model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
stream=True,
stream_options={
"include_usage":
True,
"continuous_usage_stats":
False
})
async for chunk in stream:
if chunk.choices[0].finish_reason is None:
assert chunk.usage is None
else:
assert chunk.usage is None
final_chunk = await stream.__anext__()
assert final_chunk.usage is not None
assert final_chunk.usage.prompt_tokens > 0
assert final_chunk.usage.completion_tokens > 0
assert final_chunk.usage.total_tokens == (
final_chunk.usage.prompt_tokens +
final_chunk.usage.completion_tokens)
assert final_chunk.choices == []
# Test stream=False, stream_options={"include_usage": None}
with pytest.raises(BadRequestError):
await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
stream=False,
stream_options={"include_usage": None})
# Test stream=False, stream_options={"include_usage": True}
with pytest.raises(BadRequestError):
await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
stream=False,
stream_options={"include_usage": True})
# Test stream=True, stream_options={"include_usage": True,
# "continuous_usage_stats": True}
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
extra_body=dict(min_tokens=10),
temperature=0.0,
stream=True,
stream_options={
"include_usage": True,
"continuous_usage_stats": True,
},
)
last_completion_tokens = 0
async for chunk in stream:
assert chunk.usage.prompt_tokens >= 0
assert last_completion_tokens == 0 or \
chunk.usage.completion_tokens > last_completion_tokens or \
(
not chunk.choices and
chunk.usage.completion_tokens == last_completion_tokens
)
assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
chunk.usage.completion_tokens)
last_completion_tokens = chunk.usage.completion_tokens
assert last_completion_tokens == 10
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
# (i.e. using the same ordering as in the Completions API tests), the test
# will fail on the second `guided_decoding_backend` even when I swap their order
# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_choice_chat(client: openai.AsyncOpenAI,
guided_decoding_backend: str,
sample_guided_choice):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
"The best language for type-safe systems programming is "
}]
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_completion_tokens=10,
extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend))
choice1 = chat_completion.choices[0].message.content
assert choice1 in sample_guided_choice
messages.append({"role": "assistant", "content": choice1})
messages.append({
"role": "user",
"content": "I disagree, pick another one"
})
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_completion_tokens=10,
extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend))
choice2 = chat_completion.choices[0].message.content
assert choice2 in sample_guided_choice
assert choice1 != choice2
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_json_chat(client: openai.AsyncOpenAI,
guided_decoding_backend: str,
sample_json_schema):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
f"Give an example JSON for an employee profile that "
f"fits this schema: {sample_json_schema}"
}]
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_completion_tokens=1000,
extra_body=dict(guided_json=sample_json_schema,
guided_decoding_backend=guided_decoding_backend))
message = chat_completion.choices[0].message
assert message.content is not None
json1 = json.loads(message.content)
jsonschema.validate(instance=json1, schema=sample_json_schema)
messages.append({"role": "assistant", "content": message.content})
messages.append({
"role":
"user",
"content":
"Give me another one with a different name and age"
})
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_completion_tokens=1000,
extra_body=dict(guided_json=sample_json_schema,
guided_decoding_backend=guided_decoding_backend))
message = chat_completion.choices[0].message
assert message.content is not None
json2 = json.loads(message.content)
jsonschema.validate(instance=json2, schema=sample_json_schema)
assert json1["name"] != json2["name"]
assert json1["age"] != json2["age"]
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_regex_chat(client: openai.AsyncOpenAI,
guided_decoding_backend: str, sample_regex):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
f"Give an example IP address with this regex: {sample_regex}"
}]
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_completion_tokens=20,
extra_body=dict(guided_regex=sample_regex,
guided_decoding_backend=guided_decoding_backend))
ip1 = chat_completion.choices[0].message.content
assert ip1 is not None
assert re.fullmatch(sample_regex, ip1) is not None
messages.append({"role": "assistant", "content": ip1})
messages.append({"role": "user", "content": "Give me a different one"})
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_completion_tokens=20,
extra_body=dict(guided_regex=sample_regex,
guided_decoding_backend=guided_decoding_backend))
ip2 = chat_completion.choices[0].message.content
assert ip2 is not None
assert re.fullmatch(sample_regex, ip2) is not None
assert ip1 != ip2
@pytest.mark.asyncio
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
"The best language for type-safe systems programming is "
}]
with pytest.raises(openai.BadRequestError):
_ = await client.chat.completions.create(model=MODEL_NAME,
messages=messages,
extra_body=dict(guided_regex={
1: "Python",
2: "C++"
}))
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
guided_decoding_backend: str,
sample_guided_choice):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
"The best language for type-safe systems programming is "
}]
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_completion_tokens=10,
logprobs=True,
top_logprobs=5,
extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend))
assert chat_completion.choices[0].logprobs is not None
assert chat_completion.choices[0].logprobs.content is not None
top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs
# -9999.0 is the minimum logprob returned by OpenAI
for item in top_logprobs:
assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})"
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_named_tool_use(client: openai.AsyncOpenAI,
guided_decoding_backend: str,
sample_json_schema):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
f"Give an example JSON for an employee profile that "
f"fits this schema: {sample_json_schema}"
}]
# non-streaming
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_completion_tokens=1000,
tools=[{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": sample_json_schema
}
}],
tool_choice={
"type": "function",
"function": {
"name": "dummy_function_name"
}
})
message = chat_completion.choices[0].message
assert len(message.content) == 0
json_string = message.tool_calls[0].function.arguments
json1 = json.loads(json_string)
jsonschema.validate(instance=json1, schema=sample_json_schema)
messages.append({"role": "assistant", "content": json_string})
messages.append({
"role":
"user",
"content":
"Give me another one with a different name and age"
})
# streaming
stream = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_completion_tokens=1000,
tools=[{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": sample_json_schema
}
}],
tool_choice={
"type": "function",
"function": {
"name": "dummy_function_name"
}
},
stream=True)
output = []
finish_reason_count = 0
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.role:
assert delta.role == "assistant"
assert delta.content is None or len(delta.content) == 0
if delta.tool_calls:
output.append(delta.tool_calls[0].function.arguments)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
    # finish_reason should only be returned in the last chunk
assert finish_reason_count == 1
json2 = json.loads("".join(output))
jsonschema.validate(instance=json2, schema=sample_json_schema)
assert json1["name"] != json2["name"]
assert json1["age"] != json2["age"]
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
async def test_required_tool_use_not_yet_supported(
client: openai.AsyncOpenAI, guided_decoding_backend: str,
sample_json_schema):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
f"Give an example JSON for an employee profile that "
f"fits this schema: {sample_json_schema}"
}]
with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_completion_tokens=1000,
tools=[{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": sample_json_schema
}
}],
tool_choice="required")
with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_completion_tokens=1000,
tools=[{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": sample_json_schema
}
}],
tool_choice="auto")
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
guided_decoding_backend: str,
sample_json_schema):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
f"Give an example JSON for an employee profile that "
f"fits this schema: {sample_json_schema}"
}]
with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(model=MODEL_NAME,
messages=messages,
max_completion_tokens=1000,
tool_choice={
"type": "function",
"function": {
"name":
"dummy_function_name"
}
})
with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_completion_tokens=1000,
tools=[{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": sample_json_schema
}
}],
tool_choice={
"type": "function",
"function": {
"name": "nondefined_function_name"
}
})
@pytest.mark.asyncio
async def test_response_format_json_object(client: openai.AsyncOpenAI):
for _ in range(2):
resp = await client.chat.completions.create(
model=MODEL_NAME,
messages=[{
"role":
"user",
"content": ('what is 1+1? please respond with a JSON object, '
'the format is {"result": 2}')
}],
response_format={"type": "json_object"})
content = resp.choices[0].message.content
assert content is not None
loaded = json.loads(content)
assert loaded == {"result": 2}, loaded
@pytest.mark.asyncio
async def test_response_format_json_schema(client: openai.AsyncOpenAI):
prompt = 'what is 1+1? The format is "result": 2'
    # Check that this prompt cannot lead to valid JSON without json_schema
for _ in range(2):
resp = await client.chat.completions.create(
model=MODEL_NAME,
messages=[{
"role": "user",
"content": prompt
}],
)
content = resp.choices[0].message.content
assert content is not None
with pytest.raises((json.JSONDecodeError, AssertionError)):
loaded = json.loads(content)
assert loaded == {"result": 2}, loaded
for _ in range(2):
resp = await client.chat.completions.create(
model=MODEL_NAME,
messages=[{
"role": "user",
"content": prompt
}],
response_format={
"type": "json_schema",
"json_schema": {
"name": "foo_test",
"schema": {
"type": "object",
"properties": {
"result": {
"type": "integer"
},
},
},
}
})
content = resp.choices[0].message.content
assert content is not None
loaded = json.loads(content)
assert loaded == {"result": 2}, loaded
@pytest.mark.asyncio
async def test_extra_fields(client: openai.AsyncOpenAI):
with pytest.raises(BadRequestError) as exc_info:
await client.chat.completions.create(
model=MODEL_NAME,
messages=[{
"role": "system",
"content": "You are a helpful assistant.",
"extra_field": "0",
}], # type: ignore
temperature=0,
seed=0)
assert "extra_forbidden" in exc_info.value.message
@pytest.mark.asyncio
async def test_complex_message_content(client: openai.AsyncOpenAI):
resp = await client.chat.completions.create(
model=MODEL_NAME,
messages=[{
"role":
"user",
"content": [{
"type":
"text",
"text":
"what is 1+1? please provide the result without any other text."
}]
}],
temperature=0,
seed=0)
content = resp.choices[0].message.content
assert content == "2"
@pytest.mark.asyncio
async def test_custom_role(client: openai.AsyncOpenAI):
# Not sure how the model handles custom roles so we just check that
# both string and complex message content are handled in the same way
resp1 = await client.chat.completions.create(
model=MODEL_NAME,
messages=[{
"role": "my-custom-role",
"content": "what is 1+1?",
}], # type: ignore
temperature=0,
seed=0)
resp2 = await client.chat.completions.create(
model=MODEL_NAME,
messages=[{
"role": "my-custom-role",
"content": [{
"type": "text",
"text": "what is 1+1?"
}]
}], # type: ignore
temperature=0,
seed=0)
content1 = resp1.choices[0].message.content
content2 = resp2.choices[0].message.content
assert content1 == content2
@pytest.mark.asyncio
async def test_long_seed(client: openai.AsyncOpenAI):
for seed in [
torch.iinfo(torch.long).min - 1,
torch.iinfo(torch.long).max + 1
]:
with pytest.raises(BadRequestError) as exc_info:
await client.chat.completions.create(
model=MODEL_NAME,
messages=[{
"role": "system",
"content": "You are a helpful assistant.",
}],
temperature=0,
seed=seed)
assert ("greater_than_equal" in exc_info.value.message
or "less_than_equal" in exc_info.value.message)

View File

@@ -0,0 +1,117 @@
import pytest
from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
load_chat_template)
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import VLLM_PATH
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()
# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATION_OUTPUT = [
("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
"""),
("facebook/opt-125m", chatml_jinja_path, False, False, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of"""),
("facebook/opt-125m", chatml_jinja_path, False, True, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
The capital of"""),
]
TEST_MESSAGES = [
{
'role': 'user',
'content': 'Hello'
},
{
'role': 'assistant',
'content': 'Hi there!'
},
{
'role': 'user',
'content': 'What is the capital of'
},
]
ASSISTANT_MESSAGE_TO_CONTINUE = {
'role': 'assistant',
'content': 'The capital of'
}
def test_load_chat_template():
# Testing chatml template
template_content = load_chat_template(chat_template=chatml_jinja_path)
# Test assertions
assert template_content is not None
# Hard coded value for template_chatml.jinja
assert template_content == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" # noqa: E501
def test_no_load_chat_template_filelike():
    # Testing a chat template path that does not exist
template = "../../examples/does_not_exist"
with pytest.raises(ValueError, match="looks like a file path"):
load_chat_template(chat_template=template)
def test_no_load_chat_template_literallike():
    # Testing a literal template string
template = "{{ messages }}"
template_content = load_chat_template(chat_template=template)
assert template_content == template
@pytest.mark.parametrize(
"model,template,add_generation_prompt,continue_final_message,expected_output",
    MODEL_TEMPLATE_GENERATION_OUTPUT)
def test_get_gen_prompt(model, template, add_generation_prompt,
continue_final_message, expected_output):
# Initialize the tokenizer
tokenizer = get_tokenizer(tokenizer_name=model)
template_content = load_chat_template(chat_template=template)
# Create a mock request object using keyword arguments
mock_request = ChatCompletionRequest(
model=model,
messages=TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE]
if continue_final_message else TEST_MESSAGES,
add_generation_prompt=add_generation_prompt,
continue_final_message=continue_final_message,
)
# Call the function and get the result
result = apply_hf_chat_template(
tokenizer,
conversation=mock_request.messages,
chat_template=mock_request.chat_template or template_content,
add_generation_prompt=mock_request.add_generation_prompt,
continue_final_message=mock_request.continue_final_message,
)
# Test assertion
assert result == expected_output, (
f"The generated prompt does not match the expected output for "
f"model {model} and template {template}")

View File

@@ -0,0 +1,126 @@
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def server():
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--enforce-eager",
# lora config below
"--max-num-seqs",
"128",
"--enable-chunked-prefill",
"--max-num-batched-tokens",
"1000",
# large prompts create a lot of output
"--disable-log-requests",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
async def test_completion_stream_options_and_logprobs_with_long_prompts(
client: openai.AsyncOpenAI):
# Test stream with long prompt
prompt = "What is the capital of France?" * 400
stream = await client.completions.create(
model=MODEL_NAME,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={
"include_usage": True,
"continuous_usage_stats": True,
},
logprobs=5,
)
tokens_received = 0
finished = False
async for chunk in stream:
assert chunk.usage.prompt_tokens >= 0
assert chunk.usage.completion_tokens >= 0
assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
chunk.usage.completion_tokens)
if not finished:
tokens_received += 1
assert chunk.choices[0].text
if chunk.choices[0].finish_reason is not None:
finished = True
if finished:
assert chunk.usage.completion_tokens == tokens_received
@pytest.mark.asyncio
async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
client: openai.AsyncOpenAI):
# Test stream with long prompt
messages = [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "What is the capital of France?" * 400
}]
stream = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={
"include_usage": True,
"continuous_usage_stats": True,
},
logprobs=True,
top_logprobs=5,
)
tokens_received = 0
empty_chunks_received = 0
finished = False
async for chunk in stream:
assert chunk.usage.prompt_tokens >= 0
assert chunk.usage.completion_tokens >= 0
assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
chunk.usage.completion_tokens)
if not finished:
if chunk.choices[0].delta.content == "":
                # when no tokens have been generated
assert chunk.usage.completion_tokens == 0
assert chunk.choices[0].logprobs is None
empty_chunks_received += 1
else:
tokens_received += 1
if chunk.choices[0].finish_reason is not None:
finished = True
if finished:
assert chunk.usage.completion_tokens == tokens_received
assert empty_chunks_received <= 1

View File

@@ -0,0 +1,131 @@
import json
import pytest
from vllm.entrypoints.openai.cli_args import (make_arg_parser,
validate_parsed_serve_args)
from vllm.entrypoints.openai.serving_engine import LoRAModulePath
from vllm.utils import FlexibleArgumentParser
from ...utils import VLLM_PATH
LORA_MODULE = {
"name": "module2",
"path": "/path/to/module2",
"base_model_name": "llama"
}
CHATML_JINJA_PATH = VLLM_PATH / "examples/template_chatml.jinja"
assert CHATML_JINJA_PATH.exists()
@pytest.fixture
def serve_parser():
parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
return make_arg_parser(parser)
### Tests for Lora module parsing
def test_valid_key_value_format(serve_parser):
# Test old format: name=path
args = serve_parser.parse_args([
'--lora-modules',
'module1=/path/to/module1',
])
expected = [LoRAModulePath(name='module1', path='/path/to/module1')]
assert args.lora_modules == expected
def test_valid_json_format(serve_parser):
# Test valid JSON format input
args = serve_parser.parse_args([
'--lora-modules',
json.dumps(LORA_MODULE),
])
expected = [
LoRAModulePath(name='module2',
path='/path/to/module2',
base_model_name='llama')
]
assert args.lora_modules == expected
def test_invalid_json_format(serve_parser):
# Test invalid JSON format input, missing closing brace
with pytest.raises(SystemExit):
serve_parser.parse_args([
'--lora-modules', '{"name": "module3", "path": "/path/to/module3"'
])
def test_invalid_type_error(serve_parser):
# Test type error when values are not JSON or key=value
with pytest.raises(SystemExit):
serve_parser.parse_args([
'--lora-modules',
'invalid_format' # This is not JSON or key=value format
])
def test_invalid_json_field(serve_parser):
# Test valid JSON format but missing required fields
with pytest.raises(SystemExit):
serve_parser.parse_args([
'--lora-modules',
'{"name": "module4"}' # Missing required 'path' field
])
def test_empty_values(serve_parser):
# Test when no LoRA modules are provided
args = serve_parser.parse_args(['--lora-modules', ''])
assert args.lora_modules == []
def test_multiple_valid_inputs(serve_parser):
# Test multiple valid inputs (both old and JSON format)
args = serve_parser.parse_args([
'--lora-modules',
'module1=/path/to/module1',
json.dumps(LORA_MODULE),
])
expected = [
LoRAModulePath(name='module1', path='/path/to/module1'),
LoRAModulePath(name='module2',
path='/path/to/module2',
base_model_name='llama')
]
assert args.lora_modules == expected
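# On an actual command line, the two formats accepted above would look like
# this (a sketch; adjust quoting for your shell):
#   python -m vllm.entrypoints.openai.api_server ... \
#       --lora-modules module1=/path/to/module1 \
#       '{"name": "module2", "path": "/path/to/module2", "base_model_name": "llama"}'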
### Tests for serve argument validation that run prior to loading
def test_enable_auto_choice_fails_without_tool_call_parser(serve_parser):
    """Ensure validation fails if tool choice is enabled with no call parser"""
# If we enable-auto-tool-choice, explode with no tool-call-parser
args = serve_parser.parse_args(args=["--enable-auto-tool-choice"])
with pytest.raises(TypeError):
validate_parsed_serve_args(args)
def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
"""Ensure validation passes with tool choice enabled with a call parser"""
args = serve_parser.parse_args(args=[
"--enable-auto-tool-choice",
"--tool-call-parser",
"mistral",
])
validate_parsed_serve_args(args)
def test_chat_template_validation_for_happy_paths(serve_parser):
"""Ensure validation passes if the chat template exists"""
args = serve_parser.parse_args(
args=["--chat-template",
CHATML_JINJA_PATH.absolute().as_posix()])
validate_parsed_serve_args(args)
def test_chat_template_validation_for_sad_paths(serve_parser):
"""Ensure validation fails if the chat template doesn't exist"""
args = serve_parser.parse_args(args=["--chat-template", "does/not/exist"])
with pytest.raises(ValueError):
validate_parsed_serve_args(args)

View File

@@ -0,0 +1,781 @@
# imports for guided decoding tests
import json
import re
import shutil
from tempfile import TemporaryDirectory
from typing import Dict, List, Optional
import jsonschema
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from openai import BadRequestError
from transformers import AutoTokenizer
from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# technically these adapters use a different base model,
# but we're not testing generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"
PA_NAME = "swapnilbp/llama_tweet_ptune"
# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also
# need to change to match the prompt adapter
PA_NUM_VIRTUAL_TOKENS = 8
@pytest.fixture(scope="module")
def zephyr_lora_files():
return snapshot_download(repo_id=LORA_NAME)
@pytest.fixture(scope="module")
def zephyr_lora_added_tokens_files(zephyr_lora_files):
tmp_dir = TemporaryDirectory()
tmp_model_dir = f"{tmp_dir.name}/zephyr"
shutil.copytree(zephyr_lora_files, tmp_model_dir)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Copy tokenizer to adapter and add some unique tokens
# 32000, 32001, 32002
added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
special_tokens=True)
assert added == 3
tokenizer.save_pretrained(tmp_model_dir)
yield tmp_model_dir
tmp_dir.cleanup()
@pytest.fixture(scope="module")
def zephyr_pa_files():
return snapshot_download(repo_id=PA_NAME)
@pytest.fixture(scope="module")
def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
zephyr_pa_files):
return [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--max-num-seqs",
"128",
"--enforce-eager",
# lora config
"--enable-lora",
"--lora-modules",
f"zephyr-lora={zephyr_lora_files}",
f"zephyr-lora2={zephyr_lora_added_tokens_files}",
"--max-lora-rank",
"64",
"--max-cpu-loras",
"2",
# pa config
"--enable-prompt-adapter",
"--prompt-adapters",
f"zephyr-pa={zephyr_pa_files}",
f"zephyr-pa2={zephyr_pa_files}",
"--max-prompt-adapters",
"2",
"--max-prompt-adapter-token",
"128",
]
@pytest.fixture(scope="module",
params=["", "--disable-frontend-multiprocessing"])
def server(default_server_args, request):
if request.param:
default_server_args.append(request.param)
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
# first test base model, then test loras, then test prompt adapters
"model_name,num_virtual_tokens",
[(MODEL_NAME, 0), ("zephyr-lora", 0), ("zephyr-lora2", 0),
("zephyr-pa", PA_NUM_VIRTUAL_TOKENS),
("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS)],
)
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
num_virtual_tokens: int):
completion = await client.completions.create(model=model_name,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 1
choice = completion.choices[0]
assert len(choice.text) >= 5
assert choice.finish_reason == "length"
assert completion.usage == openai.types.CompletionUsage(
completion_tokens=5,
prompt_tokens=6 + num_virtual_tokens,
total_tokens=11 + num_virtual_tokens)
# test using token IDs
completion = await client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
assert len(completion.choices[0].text) >= 1
assert completion.choices[0].prompt_logprobs is None
@pytest.mark.asyncio
async def test_added_lora_tokens(client: openai.AsyncOpenAI):
# test using token IDs
completion = await client.completions.create(
model="zephyr-lora2",
prompt=[0, 0, 32000, 32001, 32002],
echo=True,
max_tokens=5,
temperature=0.0,
)
# Added tokens should appear in tokenized prompt
assert completion.choices[0].text.startswith("<unk><unk>vllm1vllm2vllm3")
@pytest.mark.asyncio
async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
# test using token IDs
with pytest.raises(openai.BadRequestError, match="out of vocabulary"):
# Added tokens should be rejected by the base model
await client.completions.create(
model=MODEL_NAME,
prompt=[0, 0, 32000, 32001, 32002],
echo=True,
max_tokens=5,
temperature=0.0,
)
@pytest.mark.asyncio
@pytest.mark.parametrize(
# first test base model, then test loras, then test prompt adapters
"model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"],
)
async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
# test using token IDs
completion = await client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
logprobs=None,
)
choice = completion.choices[0]
assert choice.logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
# just test 1 lora and 1 pa hereafter
"model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
# test using token IDs
completion = await client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
logprobs=0,
)
choice = completion.choices[0]
assert choice.logprobs is not None
assert choice.logprobs.token_logprobs is not None
assert choice.logprobs.top_logprobs is not None
assert len(choice.logprobs.top_logprobs[0]) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
# test using token IDs
completion = await client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
logprobs=5,
)
choice = completion.choices[0]
assert choice.logprobs is not None
assert choice.logprobs.token_logprobs is not None
assert choice.logprobs.top_logprobs is not None
assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
model_name: str):
with pytest.raises(
(openai.BadRequestError, openai.APIError)): # test using token IDs
await client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
# vLLM has higher default max_logprobs (20 instead of 5) to support
# both Completion API and Chat Completion API
logprobs=21,
)
with pytest.raises(
(openai.BadRequestError, openai.APIError)): # test using token IDs
stream = await client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
# vLLM has higher default max_logprobs (20 instead of 5) to support
# both Completion API and Chat Completion API
logprobs=30,
stream=True,
)
async for chunk in stream:
...
# the server should still work afterwards
completion = await client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
assert len(completion.choices[0].text) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name, prompt_logprobs", [(MODEL_NAME, -1),
(MODEL_NAME, 0),
(MODEL_NAME, 1),
(MODEL_NAME, None)])
async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI,
model_name: str,
prompt_logprobs: Optional[int]):
params: Dict = {
"prompt": ["A robot may not injure another robot", "My name is"],
"model": model_name,
}
if prompt_logprobs is not None:
params["extra_body"] = {"prompt_logprobs": prompt_logprobs}
if prompt_logprobs is not None and prompt_logprobs < 0:
with pytest.raises(BadRequestError):
await client.completions.create(**params)
else:
completion = await client.completions.create(**params)
if prompt_logprobs is not None:
assert completion.choices[0].prompt_logprobs is not None
assert len(completion.choices[0].prompt_logprobs) > 0
assert completion.choices[1].prompt_logprobs is not None
assert len(completion.choices[1].prompt_logprobs) > 0
else:
assert completion.choices[0].prompt_logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_completion_streaming(client: openai.AsyncOpenAI,
model_name: str):
prompt = "What is an LLM?"
single_completion = await client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
)
single_output = single_completion.choices[0].text
stream = await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True)
chunks: List[str] = []
finish_reason_count = 0
async for chunk in stream:
chunks.append(chunk.choices[0].text)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
# finish reason should only return in last block
assert finish_reason_count == 1
assert chunk.choices[0].finish_reason == "length"
assert chunk.choices[0].text
assert "".join(chunks) == single_output
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
"""Streaming for parallel sampling.
    The tokens from multiple samples are flattened into a single stream,
with an index to indicate which sample the token belongs to.
"""
prompt = "What is an LLM?"
n = 3
max_tokens = 5
stream = await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=max_tokens,
n=n,
stream=True)
    chunks: List[List[str]] = [[] for _ in range(n)]
finish_reason_count = 0
async for chunk in stream:
index = chunk.choices[0].index
text = chunk.choices[0].text
chunks[index].append(text)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
assert finish_reason_count == n
for chunk in chunks:
assert len(chunk) == max_tokens
print("".join(chunk))
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_completion_stream_options(client: openai.AsyncOpenAI,
model_name: str):
prompt = "What is the capital of France?"
# Test stream=True, stream_options=
# {"include_usage": False, "continuous_usage_stats": False}
stream = await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={
"include_usage": False,
"continuous_usage_stats":
False,
})
async for chunk in stream:
assert chunk.usage is None
# Test stream=True, stream_options=
# {"include_usage": False, "continuous_usage_stats": True}
stream = await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={
"include_usage": False,
"continuous_usage_stats":
True,
})
async for chunk in stream:
assert chunk.usage is None
# Test stream=True, stream_options=
# {"include_usage": True, "continuous_usage_stats": False}
stream = await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={
"include_usage": True,
"continuous_usage_stats":
False,
})
    async for chunk in stream:
        # every chunk the loop sees carries usage=None; the usage-only chunk
        # is fetched manually once finish_reason arrives
        assert chunk.usage is None
        if chunk.choices[0].finish_reason is not None:
            final_chunk = await stream.__anext__()
assert final_chunk.usage is not None
assert final_chunk.usage.prompt_tokens > 0
assert final_chunk.usage.completion_tokens > 0
assert final_chunk.usage.total_tokens == (
final_chunk.usage.prompt_tokens +
final_chunk.usage.completion_tokens)
assert final_chunk.choices == []
# Test stream=True, stream_options=
# {"include_usage": True, "continuous_usage_stats": True}
stream = await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={
"include_usage": True,
"continuous_usage_stats":
True,
})
async for chunk in stream:
assert chunk.usage is not None
assert chunk.usage.prompt_tokens > 0
assert chunk.usage.completion_tokens > 0
assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
chunk.usage.completion_tokens)
if chunk.choices[0].finish_reason is not None:
final_chunk = await stream.__anext__()
assert final_chunk.usage is not None
assert final_chunk.usage.prompt_tokens > 0
assert final_chunk.usage.completion_tokens > 0
assert final_chunk.usage.total_tokens == (
final_chunk.usage.prompt_tokens +
final_chunk.usage.completion_tokens)
assert final_chunk.choices == []
# Test stream=False, stream_options=
# {"include_usage": None}
with pytest.raises(BadRequestError):
await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=False,
stream_options={"include_usage": None})
# Test stream=False, stream_options=
# {"include_usage": True}
with pytest.raises(BadRequestError):
await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=False,
stream_options={"include_usage": True})
# Test stream=False, stream_options=
# {"continuous_usage_stats": None}
with pytest.raises(BadRequestError):
await client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=False,
stream_options={"continuous_usage_stats": None})
# Test stream=False, stream_options=
# {"continuous_usage_stats": True}
with pytest.raises(BadRequestError):
await client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=False,
stream_options={"continuous_usage_stats": True})
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
# test both text and token IDs
for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2):
# test simple list
batch = await client.completions.create(
model=model_name,
prompt=prompts,
max_tokens=5,
temperature=0.0,
)
assert len(batch.choices) == 2
assert batch.choices[0].text == batch.choices[1].text
# test n = 2
batch = await client.completions.create(
model=model_name,
prompt=prompts,
n=2,
max_tokens=5,
temperature=0.0,
extra_body=dict(
# NOTE: this has to be true for n > 1 in vLLM, but
# not necessary for official client.
use_beam_search=True),
)
assert len(batch.choices) == 4
assert batch.choices[0].text != batch.choices[
1].text, "beam search should be different"
assert batch.choices[0].text == batch.choices[
2].text, "two copies of the same prompt should be the same"
assert batch.choices[1].text == batch.choices[
3].text, "two copies of the same prompt should be the same"
# test streaming
batch = await client.completions.create(
model=model_name,
prompt=prompts,
max_tokens=5,
temperature=0.0,
stream=True,
)
texts = [""] * 2
async for chunk in batch:
assert len(chunk.choices) == 1
choice = chunk.choices[0]
texts[choice.index] += choice.text
assert texts[0] == texts[1]
@pytest.mark.asyncio
async def test_logits_bias(client: openai.AsyncOpenAI):
prompt = "Hello, my name is"
max_tokens = 5
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
# Test exclusive selection
token_id = 1000
completion = await client.completions.create(
model=MODEL_NAME,
prompt=prompt,
max_tokens=max_tokens,
temperature=0.0,
logit_bias={str(token_id): 100},
seed=42,
)
assert len(completion.choices[0].text) >= 5
response_tokens = tokenizer(completion.choices[0].text,
add_special_tokens=False)["input_ids"]
expected_tokens = tokenizer(tokenizer.decode([token_id] * 5),
add_special_tokens=False)["input_ids"]
assert all([
response == expected
for response, expected in zip(response_tokens, expected_tokens)
])
# Test ban
completion = await client.completions.create(
model=MODEL_NAME,
prompt=prompt,
max_tokens=max_tokens,
temperature=0.0,
)
response_tokens = tokenizer(completion.choices[0].text,
add_special_tokens=False)["input_ids"]
first_response = completion.choices[0].text
completion = await client.completions.create(
model=MODEL_NAME,
prompt=prompt,
max_tokens=max_tokens,
temperature=0.0,
logit_bias={str(token): -100
for token in response_tokens},
)
assert first_response != completion.choices[0].text
@pytest.mark.asyncio
async def test_allowed_token_ids(client: openai.AsyncOpenAI):
prompt = "Hello, my name is"
max_tokens = 1
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
# Test exclusive selection
allowed_ids = [21555, 21557, 21558]
completion = await client.completions.create(
model=MODEL_NAME,
prompt=prompt,
max_tokens=max_tokens,
temperature=0.0,
seed=42,
extra_body=dict(allowed_token_ids=allowed_ids),
logprobs=1,
)
response_tokens = completion.choices[0].logprobs.tokens
assert len(response_tokens) == 1
assert tokenizer.convert_tokens_to_ids(response_tokens)[0] in allowed_ids
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_json_completion(client: openai.AsyncOpenAI,
guided_decoding_backend: str,
sample_json_schema):
completion = await client.completions.create(
model=MODEL_NAME,
prompt=f"Give an example JSON for an employee profile "
f"that fits this schema: {sample_json_schema}",
n=3,
temperature=1.0,
max_tokens=500,
extra_body=dict(guided_json=sample_json_schema,
guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None
assert len(completion.choices) == 3
for i in range(3):
output_json = json.loads(completion.choices[i].text)
jsonschema.validate(instance=output_json, schema=sample_json_schema)
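# `sample_json_schema` comes from the shared conftest and is not shown in
# this diff. Judging from the fields asserted elsewhere ("name", "age"), it
# is an employee-profile object schema roughly of this shape (assumed):
_EXAMPLE_EMPLOYEE_SCHEMA = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer"},
    },
    "required": ["name", "age"],
}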
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_regex_completion(client: openai.AsyncOpenAI,
guided_decoding_backend: str,
sample_regex):
completion = await client.completions.create(
model=MODEL_NAME,
prompt=f"Give an example IPv4 address with this regex: {sample_regex}",
n=3,
temperature=1.0,
max_tokens=20,
extra_body=dict(guided_regex=sample_regex,
guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None
assert len(completion.choices) == 3
for i in range(3):
assert re.fullmatch(sample_regex,
completion.choices[i].text) is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_choice_completion(client: openai.AsyncOpenAI,
guided_decoding_backend: str,
sample_guided_choice):
completion = await client.completions.create(
model=MODEL_NAME,
prompt="The best language for type-safe systems programming is ",
n=2,
temperature=1.0,
max_tokens=10,
extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None
assert len(completion.choices) == 2
for i in range(2):
assert completion.choices[i].text in sample_guided_choice
@pytest.mark.asyncio
async def test_guided_grammar(client: openai.AsyncOpenAI,
sample_sql_statements):
completion = await client.completions.create(
model=MODEL_NAME,
prompt=("Generate a sql state that select col_1 from "
"table_1 where it is equals to 1"),
temperature=1.0,
max_tokens=500,
extra_body=dict(guided_grammar=sample_sql_statements))
content = completion.choices[0].text
# use Lark to parse the output, and make sure it's a valid parse tree
from lark import Lark
parser = Lark(sample_sql_statements)
parser.parse(content)
# remove spaces for comparison b/c we removed them in the grammar
ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")
assert content.strip() == ground_truth
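# `sample_sql_statements` is a Lark grammar fixture from the shared conftest
# (not shown). Given the ground truth above, it is roughly of this shape;
# the exact fixture may differ (assumed sketch):
_EXAMPLE_SQL_GRAMMAR_SKETCH = r'''
start: "SELECT" column "from" table "where" condition
column: "col_1" | "col_2"
table: "table_1" | "table_2"
condition: column "=" number
number: "1" | "2"
'''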
@pytest.mark.asyncio
@pytest.mark.parametrize(
# first test base model, then test loras
"model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
@pytest.mark.parametrize("logprobs_arg", [1, 0])
async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
model_name: str, logprobs_arg: int):
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
# test using text and token IDs
for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
completion = await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
echo=True,
logprobs=logprobs_arg)
prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
list) else prompt
assert re.search(r"^" + prompt_text, completion.choices[0].text)
logprobs = completion.choices[0].logprobs
assert logprobs is not None
assert len(logprobs.text_offset) > 5
assert (len(logprobs.token_logprobs) > 5
and logprobs.token_logprobs[0] is None)
assert (len(logprobs.top_logprobs) > 5
and logprobs.top_logprobs[0] is None)
for top_logprobs in logprobs.top_logprobs[1:]:
assert max(logprobs_arg,
1) <= len(top_logprobs) <= logprobs_arg + 1
assert len(logprobs.tokens) > 5
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
guided_decoding_backend: str,
sample_json_schema, sample_regex):
with pytest.raises(openai.BadRequestError):
_ = await client.completions.create(
model=MODEL_NAME,
prompt="Give an example JSON that fits this schema: 42",
extra_body=dict(guided_json=42,
guided_decoding_backend=guided_decoding_backend))
with pytest.raises(openai.BadRequestError):
_ = await client.completions.create(
model=MODEL_NAME,
prompt="Give an example string that fits this regex",
extra_body=dict(guided_regex=sample_regex,
guided_json=sample_json_schema))

View File

@@ -0,0 +1,250 @@
import base64
import numpy as np
import openai
import pytest
import pytest_asyncio
import requests
from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer
MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
@pytest.fixture(scope="module")
def server():
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--enforce-eager",
"--max-model-len",
"8192",
"--chat-template",
DUMMY_CHAT_TEMPLATE,
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
input_texts = [
"The chef prepared a delicious meal.",
]
# test single embedding
embeddings = await client.embeddings.create(
model=model_name,
input=input_texts,
encoding_format="float",
)
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 9
assert embeddings.usage.total_tokens == 9
# test using token IDs
input_tokens = [1, 1, 1, 1, 1]
embeddings = await client.embeddings.create(
model=model_name,
input=input_tokens,
encoding_format="float",
)
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 5
assert embeddings.usage.total_tokens == 5
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
# test List[str]
input_texts = [
"The cat sat on the mat.", "A feline was resting on a rug.",
"Stars twinkle brightly in the night sky."
]
embeddings = await client.embeddings.create(
model=model_name,
input=input_texts,
encoding_format="float",
)
assert embeddings.id is not None
assert len(embeddings.data) == 3
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 32
assert embeddings.usage.total_tokens == 32
# test List[List[int]]
input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
[25, 32, 64, 77]]
embeddings = await client.embeddings.create(
model=model_name,
input=input_tokens,
encoding_format="float",
)
assert embeddings.id is not None
assert len(embeddings.data) == 4
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 17
assert embeddings.usage.total_tokens == 17
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(server: RemoteOpenAIServer,
client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "user",
"content": "The cat sat on the mat.",
}, {
"role": "assistant",
"content": "A feline was resting on a rug.",
}, {
"role": "user",
"content": "Stars twinkle brightly in the night sky.",
}]
chat_response = requests.post(server.url_for("v1/embeddings"),
json={
"model": model_name,
"messages": messages,
"encoding_format": "float",
})
chat_response.raise_for_status()
chat_embeddings = chat_response.json()
tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
prompt = tokenizer.apply_chat_template(
messages,
chat_template=DUMMY_CHAT_TEMPLATE,
add_generation_prompt=True,
continue_final_message=False,
tokenize=False,
)
completion_response = await client.embeddings.create(
model=model_name,
input=prompt,
encoding_format="float",
# To be consistent with chat
extra_body={"add_special_tokens": False},
)
completion_embeddings = completion_response.model_dump(mode="json")
assert chat_embeddings.pop("id") is not None
assert completion_embeddings.pop("id") is not None
assert chat_embeddings.pop("created") <= completion_embeddings.pop(
"created")
assert chat_embeddings == completion_embeddings
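# With DUMMY_CHAT_TEMPLATE, the three messages above render to exactly:
#   user: The cat sat on the mat.
#   assistant: A feline was resting on a rug.
#   user: Stars twinkle brightly in the night sky.
# (one trailing newline per message; this template ignores
# add_generation_prompt)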
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
model_name: str):
input_texts = [
"Hello my name is",
"The best thing about vLLM is that it supports many different models"
]
responses_float = await client.embeddings.create(input=input_texts,
model=model_name,
encoding_format="float")
responses_base64 = await client.embeddings.create(input=input_texts,
model=model_name,
encoding_format="base64")
decoded_responses_base64_data = []
for data in responses_base64.data:
decoded_responses_base64_data.append(
np.frombuffer(base64.b64decode(data.embedding),
dtype="float32").tolist())
assert responses_float.data[0].embedding == decoded_responses_base64_data[
0]
assert responses_float.data[1].embedding == decoded_responses_base64_data[
1]
# Default response is float32 decoded from base64 by OpenAI Client
responses_default = await client.embeddings.create(input=input_texts,
model=model_name)
assert responses_float.data[0].embedding == responses_default.data[
0].embedding
assert responses_float.data[1].embedding == responses_default.data[
1].embedding
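# The inline decode above, factored out for clarity (float32 values carried
# as a base64 payload, per the OpenAI-style embeddings response):
def _decode_base64_embedding(payload: str) -> list:
    return np.frombuffer(base64.b64decode(payload), dtype="float32").tolist()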
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
model_name: str):
input_texts = [
"Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
]
# test single embedding
embeddings = await client.embeddings.create(
model=model_name,
input=input_texts,
extra_body={"truncate_prompt_tokens": 10})
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 10
assert embeddings.usage.total_tokens == 10
input_tokens = [
1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
]
embeddings = await client.embeddings.create(
model=model_name,
input=input_tokens,
extra_body={"truncate_prompt_tokens": 10})
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 10
assert embeddings.usage.total_tokens == 10
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
model_name: str):
input_texts = [
"Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
]
    with pytest.raises(openai.BadRequestError) as exc_info:
        await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193})
    assert ("truncate_prompt_tokens value is greater than max_model_len. "
            "Please, select a smaller truncation size."
            in exc_info.value.message)

View File

@@ -0,0 +1,52 @@
import openai
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
MODEL_NAME = "facebook/bart-base"
@pytest.fixture(scope="module")
def server():
args = [
"--dtype",
"bfloat16",
"--enforce-eager",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
completion = await client.completions.create(model=model_name,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 1
choice = completion.choices[0]
assert len(choice.text) >= 5
assert choice.finish_reason == "length"
assert completion.usage == openai.types.CompletionUsage(
completion_tokens=5, prompt_tokens=2, total_tokens=7)
# test using token IDs
completion = await client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
assert len(completion.choices[0].text) >= 1

View File

@@ -0,0 +1,83 @@
import json
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"
@pytest.fixture(scope="module")
def zephyr_lora_files():
return snapshot_download(repo_id=LORA_NAME)
@pytest.fixture(scope="module")
def server_with_lora_modules_json(zephyr_lora_files):
    # Define the JSON-format LoRA module configurations
lora_module_1 = {
"name": "zephyr-lora",
"path": zephyr_lora_files,
"base_model_name": MODEL_NAME
}
lora_module_2 = {
"name": "zephyr-lora2",
"path": zephyr_lora_files,
"base_model_name": MODEL_NAME
}
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--enforce-eager",
# lora config below
"--enable-lora",
"--lora-modules",
json.dumps(lora_module_1),
json.dumps(lora_module_2),
"--max-lora-rank",
"64",
"--max-cpu-loras",
"2",
"--max-num-seqs",
"64",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client_for_lora_lineage(server_with_lora_modules_json):
async with server_with_lora_modules_json.get_async_client(
) as async_client:
yield async_client
@pytest.mark.asyncio
async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
zephyr_lora_files):
models = await client_for_lora_lineage.models.list()
models = models.data
served_model = models[0]
lora_models = models[1:]
assert served_model.id == MODEL_NAME
assert served_model.root == MODEL_NAME
assert served_model.parent is None
assert all(lora_model.root == zephyr_lora_files
for lora_model in lora_models)
assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
assert lora_models[0].id == "zephyr-lora"
assert lora_models[1].id == "zephyr-lora2"

View File

@@ -0,0 +1,236 @@
import subprocess
import sys
import tempfile
import time
from http import HTTPStatus
import openai
import pytest
import pytest_asyncio
import requests
from prometheus_client.parser import text_string_to_metric_families
from transformers import AutoTokenizer
from ...utils import RemoteOpenAIServer
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
@pytest.fixture(scope="module")
def default_server_args():
return [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"1024",
"--enforce-eager",
"--max-num-seqs",
"128",
]
@pytest.fixture(scope="module",
params=[
"",
"--enable-chunked-prefill",
"--disable-frontend-multiprocessing",
])
def server(default_server_args, request):
if request.param:
default_server_args.append(request.param)
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as cl:
yield cl
_PROMPT = "Hello my name is Robert and I love magic"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"]
_NUM_REQUESTS = 10
_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT)
_NUM_GENERATION_TOKENS_PER_REQUEST = 10
# {metric_family: [(suffix, expected_value)]}
EXPECTED_VALUES = {
"vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
"vllm:time_per_output_token_seconds":
[("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))],
"vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_prompt_tokens":
[("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:request_generation_tokens":
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:request_params_n": [("_count", _NUM_REQUESTS)],
"vllm:request_params_max_tokens":
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:prompt_tokens": [("_total",
_NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
"vllm:generation_tokens": [
("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
],
"vllm:request_success": [("_total", _NUM_REQUESTS)],
}
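# A hypothetical helper, equivalent to the nested scan in test_metrics_counts
# below: return the value of one named sample from the Prometheus text
# exposition, or None if it is absent.
def _find_sample_value(metrics_text, sample_name):
    for family in text_string_to_metric_families(metrics_text):
        for sample in family.samples:
            if sample.name == sample_name:
                return sample.value
    return None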
@pytest.mark.asyncio
async def test_metrics_counts(server: RemoteOpenAIServer,
client: openai.AsyncClient):
for _ in range(_NUM_REQUESTS):
# sending a request triggers the metrics to be logged.
await client.completions.create(
model=MODEL_NAME,
prompt=_TOKENIZED_PROMPT,
max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST)
response = requests.get(server.url_for("metrics"))
print(response.text)
assert response.status_code == HTTPStatus.OK
# Loop over all expected metric_families
for metric_family, suffix_values_list in EXPECTED_VALUES.items():
found_metric = False
# Check to see if the metric_family is found in the prom endpoint.
for family in text_string_to_metric_families(response.text):
if family.name == metric_family:
found_metric = True
# Check that each suffix is found in the prom endpoint.
for suffix, expected_value in suffix_values_list:
metric_name_w_suffix = f"{metric_family}{suffix}"
found_suffix = False
for sample in family.samples:
if sample.name == metric_name_w_suffix:
found_suffix = True
                            # For each suffix, make sure the value matches
                            # what we expect.
assert sample.value == expected_value, (
f"{metric_name_w_suffix} expected value of "
f"{expected_value} did not match found value "
f"{sample.value}")
break
assert found_suffix, (
f"Did not find {metric_name_w_suffix} in prom endpoint"
)
break
assert found_metric, (f"Did not find {metric_family} in prom endpoint")
EXPECTED_METRICS = [
"vllm:num_requests_running",
"vllm:num_requests_swapped",
"vllm:num_requests_waiting",
"vllm:gpu_cache_usage_perc",
"vllm:cpu_cache_usage_perc",
"vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count",
"vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count",
"vllm:e2e_request_latency_seconds_sum",
"vllm:e2e_request_latency_seconds_bucket",
"vllm:e2e_request_latency_seconds_count",
"vllm:request_prompt_tokens_sum",
"vllm:request_prompt_tokens_bucket",
"vllm:request_prompt_tokens_count",
"vllm:request_generation_tokens_sum",
"vllm:request_generation_tokens_bucket",
"vllm:request_generation_tokens_count",
"vllm:request_params_n_sum",
"vllm:request_params_n_bucket",
"vllm:request_params_n_count",
"vllm:request_params_max_tokens_sum",
"vllm:request_params_max_tokens_bucket",
"vllm:request_params_max_tokens_count",
"vllm:num_preemptions_total",
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
"vllm:request_success_total",
"vllm:cache_config_info",
# labels in cache_config_info
"block_size",
"cache_dtype",
"cpu_offload_gb",
"enable_prefix_caching",
"gpu_memory_utilization",
"num_cpu_blocks",
"num_gpu_blocks",
"num_gpu_blocks_override",
"sliding_window",
"swap_space_bytes",
]
@pytest.mark.asyncio
async def test_metrics_exist(server: RemoteOpenAIServer,
client: openai.AsyncClient):
# sending a request triggers the metrics to be logged.
await client.completions.create(model=MODEL_NAME,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK
for metric in EXPECTED_METRICS:
assert metric in response.text
def test_metrics_exist_run_batch():
input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}""" # noqa: E501
base_url = "0.0.0.0"
port = "8001"
server_url = f"http://{base_url}:{port}"
with tempfile.NamedTemporaryFile(
"w") as input_file, tempfile.NamedTemporaryFile(
"r") as output_file:
input_file.write(input_batch)
input_file.flush()
proc = subprocess.Popen([
sys.executable,
"-m",
"vllm.entrypoints.openai.run_batch",
"-i",
input_file.name,
"-o",
output_file.name,
"--model",
"intfloat/e5-mistral-7b-instruct",
"--enable-metrics",
"--url",
base_url,
"--port",
port,
], )
def is_server_up(url):
try:
response = requests.get(url)
return response.status_code == 200
except requests.ConnectionError:
return False
while not is_server_up(server_url):
time.sleep(1)
response = requests.get(server_url + "/metrics")
assert response.status_code == HTTPStatus.OK
proc.wait()
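# The readiness loop above polls forever; a bounded variant (a sketch, not
# used by the test) would look like this:
def _wait_until_up(url: str, timeout_s: float = 60.0) -> bool:
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            if requests.get(url).status_code == 200:
                return True
        except requests.ConnectionError:
            pass
        time.sleep(1)
    return False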

Some files were not shown because too many files have changed in this diff.