init

2026-01-09 13:34:11 +08:00
parent dfa6476b58
commit b2ef04d792
538 changed files with 105693 additions and 2 deletions
--- a/tests/init.py
+++ b/tests/init.py
--- a/tests/async_engine/api_server_async_engine.py
+++ b/tests/async_engine/api_server_async_engine.py
@@ -0,0 +1,50 @@
+"""vllm.entrypoints.api_server with some extra logging for testing."""
+import argparse
+from typing import Any, Dict
+
+import uvicorn
+from fastapi.responses import JSONResponse, Response
+
+import vllm.entrypoints.api_server
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+
+app = vllm.entrypoints.api_server.app
+
+
+class AsyncLLMEngineWithStats(AsyncLLMEngine):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._num_aborts = 0
+
+    async def abort(self, request_id: str) -> None:
+        await super().abort(request_id)
+        self._num_aborts += 1
+
+    def testing_stats(self) -> Dict[str, Any]:
+        return {"num_aborted_requests": self._num_aborts}
+
+
+@app.get("/stats")
+def stats() -> Response:
+    """Get the statistics of the engine."""
+    return JSONResponse(engine.testing_stats())
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser = AsyncEngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+
+    engine_args = AsyncEngineArgs.from_cli_args(args)
+    engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
+    vllm.entrypoints.api_server.engine = engine
+    uvicorn.run(
+        app,
+        host=args.host,
+        port=args.port,
+        log_level="debug",
+        timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE)
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@@ -0,0 +1,108 @@
+import subprocess
+import sys
+import time
+from multiprocessing import Pool
+from pathlib import Path
+
+import pytest
+import requests
+
+
+def _query_server(prompt: str, max_tokens: int = 5) -> dict:
+    response = requests.post("http://localhost:8000/generate",
+                             json={
+                                 "prompt": prompt,
+                                 "max_tokens": max_tokens,
+                                 "temperature": 0,
+                                 "ignore_eos": True
+                             })
+    response.raise_for_status()
+    return response.json()
+
+
+def _query_server_long(prompt: str) -> dict:
+    return _query_server(prompt, max_tokens=500)
+
+
+@pytest.fixture
+def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
+               worker_use_ray: bool):
+    script_path = Path(__file__).parent.joinpath(
+        "api_server_async_engine.py").absolute()
+    commands = [
+        sys.executable, "-u",
+        str(script_path), "--model", "facebook/opt-125m", "--host",
+        "127.0.0.1", "--tokenizer-pool-size",
+        str(tokenizer_pool_size)
+    ]
+    if engine_use_ray:
+        commands.append("--engine-use-ray")
+    if worker_use_ray:
+        commands.append("--worker-use-ray")
+    uvicorn_process = subprocess.Popen(commands)
+    yield
+    uvicorn_process.terminate()
+
+
+@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
+@pytest.mark.parametrize("worker_use_ray", [False, True])
+@pytest.mark.parametrize("engine_use_ray", [False, True])
+def test_api_server(api_server, tokenizer_pool_size: int, worker_use_ray: bool,
+                    engine_use_ray: bool):
+    """
+    Run the API server and test it.
+
+    We run both the server and requests in separate processes.
+
+    We test that the server can handle incoming requests, including
+    multiple requests at the same time, and that it can handle requests
+    being cancelled without crashing.
+    """
+    with Pool(32) as pool:
+        # Wait until the server is ready
+        prompts = ["warm up"] * 1
+        result = None
+        while not result:
+            try:
+                for r in pool.map(_query_server, prompts):
+                    result = r
+                    break
+            except requests.exceptions.ConnectionError:
+                time.sleep(1)
+
+        # Actual tests start here
+        # Try with 1 prompt
+        for result in pool.map(_query_server, prompts):
+            assert result
+
+        num_aborted_requests = requests.get(
+            "http://localhost:8000/stats").json()["num_aborted_requests"]
+        assert num_aborted_requests == 0
+
+        # Try with 100 prompts
+        prompts = ["test prompt"] * 100
+        for result in pool.map(_query_server, prompts):
+            assert result
+
+    with Pool(32) as pool:
+        # Cancel requests
+        prompts = ["canceled requests"] * 100
+        pool.map_async(_query_server_long, prompts)
+        time.sleep(0.01)
+        pool.terminate()
+        pool.join()
+
+        # check cancellation stats
+        # give it some times to update the stats
+        time.sleep(1)
+
+        num_aborted_requests = requests.get(
+            "http://localhost:8000/stats").json()["num_aborted_requests"]
+        assert num_aborted_requests > 0
+
+    # check that server still runs after cancellations
+    with Pool(32) as pool:
+        # Try with 100 prompts
+        prompts = ["test prompt after canceled"] * 100
+        for result in pool.map(_query_server, prompts):
+            assert result
--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -0,0 +1,96 @@
+import asyncio
+from dataclasses import dataclass
+
+import pytest
+
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+
+
+@dataclass
+class RequestOutput:
+    request_id: int
+    finished: bool = False
+
+
+class MockEngine:
+
+    def __init__(self):
+        self.step_calls = 0
+        self.add_request_calls = 0
+        self.abort_request_calls = 0
+        self.request_id = None
+
+    async def step_async(self):
+        self.step_calls += 1
+        return [RequestOutput(
+            request_id=self.request_id)] if self.request_id else []
+
+    async def encode_request_async(self, *args, **kwargs):
+        pass
+
+    def generate(self, request_id):
+        self.request_id = request_id
+
+    def stop_generating(self):
+        self.request_id = None
+
+    def add_request(self, **kwargs):
+        del kwargs  # Unused
+        self.add_request_calls += 1
+
+    async def add_request_async(self, **kwargs):
+        self.add_request_calls += 1
+        return
+
+    def abort_request(self, request_id):
+        del request_id  # Unused
+        self.abort_request_calls += 1
+
+    def has_unfinished_requests(self):
+        return self.request_id is not None
+
+
+class MockAsyncLLMEngine(AsyncLLMEngine):
+
+    def _init_engine(self, *args, **kwargs):
+        return MockEngine()
+
+
+@pytest.mark.asyncio
+async def test_new_requests_event():
+    engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False)
+    engine.start_background_loop()
+    await asyncio.sleep(0.01)
+    assert engine.engine.step_calls == 0
+
+    await engine.add_request("1", "", None)
+    await asyncio.sleep(0.01)
+    assert engine.engine.add_request_calls == 1
+    assert engine.engine.step_calls == 1
+
+    await engine.add_request("2", "", None)
+    engine.engine.generate("2")
+    await asyncio.sleep(0)
+    await asyncio.sleep(0)
+    assert engine.engine.add_request_calls == 2
+    assert engine.engine.step_calls >= 2
+    await asyncio.sleep(0.001)
+    assert engine.engine.step_calls >= 3
+    engine.engine.stop_generating()
+    await asyncio.sleep(0.001)
+    old_step_calls = engine.engine.step_calls
+    await asyncio.sleep(0.001)
+    assert engine.engine.step_calls == old_step_calls
+
+    await engine.add_request("3", "", None)
+    await asyncio.sleep(0.01)
+    assert engine.engine.add_request_calls == 3
+    assert engine.engine.step_calls == old_step_calls + 1
+    await asyncio.sleep(0.01)
+    assert engine.engine.add_request_calls == 3
+    assert engine.engine.step_calls == old_step_calls + 1
+
+    engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
+    assert engine.get_model_config() is not None
+    assert engine.get_tokenizer() is not None
+    assert engine.get_decoding_config() is not None
--- a/tests/async_engine/test_chat_template.py
+++ b/tests/async_engine/test_chat_template.py
@@ -0,0 +1,134 @@
+import os
+import pathlib
+from dataclasses import dataclass
+
+import pytest
+
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
+    __file__))).parent.parent / "examples/template_chatml.jinja"
+assert chatml_jinja_path.exists()
+
+# Define models, templates, and their corresponding expected outputs
+MODEL_TEMPLATE_GENERATON_OUTPUT = [
+    ("facebook/opt-125m", None, True,
+     "Hello</s>Hi there!</s>What is the capital of</s>"),
+    ("facebook/opt-125m", None, False,
+     "Hello</s>Hi there!</s>What is the capital of</s>"),
+    ("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user
+Hello<|im_end|>
+<|im_start|>assistant
+Hi there!<|im_end|>
+<|im_start|>user
+What is the capital of<|im_end|>
+<|im_start|>assistant
+"""),
+    ("facebook/opt-125m", chatml_jinja_path, False, """<|im_start|>user
+Hello<|im_end|>
+<|im_start|>assistant
+Hi there!<|im_end|>
+<|im_start|>user
+What is the capital of""")
+]
+
+TEST_MESSAGES = [
+    {
+        'role': 'user',
+        'content': 'Hello'
+    },
+    {
+        'role': 'assistant',
+        'content': 'Hi there!'
+    },
+    {
+        'role': 'user',
+        'content': 'What is the capital of'
+    },
+]
+
+
+@dataclass
+class MockTokenizer:
+    chat_template = None
+
+
+@dataclass
+class MockServingChat:
+    tokenizer: MockTokenizer
+
+
+@pytest.mark.asyncio
+async def test_load_chat_template():
+    # Testing chatml template
+    tokenizer = MockTokenizer()
+    mock_serving_chat = MockServingChat(tokenizer)
+    await OpenAIServingChat._load_chat_template(
+        mock_serving_chat, chat_template=chatml_jinja_path)
+
+    template_content = tokenizer.chat_template
+
+    # Test assertions
+    assert template_content is not None
+    # Hard coded value for template_chatml.jinja
+    assert template_content == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
+{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""  # noqa: E501
+
+
+@pytest.mark.asyncio
+async def test_no_load_chat_template_filelike():
+    # Testing chatml template
+    template = "../../examples/does_not_exist"
+    tokenizer = MockTokenizer()
+
+    mock_serving_chat = MockServingChat(tokenizer)
+
+    with pytest.raises(ValueError, match="looks like a file path"):
+        await OpenAIServingChat._load_chat_template(mock_serving_chat,
+                                                    chat_template=template)
+
+
+@pytest.mark.asyncio
+async def test_no_load_chat_template_literallike():
+    # Testing chatml template
+    template = "{{ messages }}"
+    tokenizer = MockTokenizer()
+
+    mock_serving_chat = MockServingChat(tokenizer)
+    await OpenAIServingChat._load_chat_template(mock_serving_chat,
+                                                chat_template=template)
+    template_content = tokenizer.chat_template
+
+    assert template_content == template
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model,template,add_generation_prompt,expected_output",
+    MODEL_TEMPLATE_GENERATON_OUTPUT)
+async def test_get_gen_prompt(model, template, add_generation_prompt,
+                              expected_output):
+    # Initialize the tokenizer
+    tokenizer = get_tokenizer(tokenizer_name=model)
+    mock_serving_chat = MockServingChat(tokenizer)
+    await OpenAIServingChat._load_chat_template(mock_serving_chat,
+                                                chat_template=template)
+
+    # Create a mock request object using keyword arguments
+    mock_request = ChatCompletionRequest(
+        model=model,
+        messages=TEST_MESSAGES,
+        add_generation_prompt=add_generation_prompt)
+
+    # Call the function and get the result
+    result = tokenizer.apply_chat_template(
+        conversation=mock_request.messages,
+        tokenize=False,
+        add_generation_prompt=mock_request.add_generation_prompt)
+
+    # Test assertion
+    assert result == expected_output, (
+        f"The generated prompt does not match the expected output for "
+        f"model {model} and template {template}")
--- a/tests/async_engine/test_merge_async_iterators.py
+++ b/tests/async_engine/test_merge_async_iterators.py
@@ -0,0 +1,41 @@
+import asyncio
+from typing import AsyncIterator, Tuple
+
+import pytest
+
+from vllm.utils import merge_async_iterators
+
+
+@pytest.mark.asyncio
+async def test_merge_async_iterators():
+
+    async def mock_async_iterator(idx: int) -> AsyncIterator[str]:
+        try:
+            while True:
+                yield f"item from iterator {idx}"
+                await asyncio.sleep(0.1)
+        except asyncio.CancelledError:
+            pass
+
+    iterators = [mock_async_iterator(i) for i in range(3)]
+    merged_iterator: AsyncIterator[Tuple[int, str]] = merge_async_iterators(
+        *iterators)
+
+    async def stream_output(generator: AsyncIterator[Tuple[int, str]]):
+        async for idx, output in generator:
+            print(f"idx: {idx}, output: {output}")
+
+    task = asyncio.create_task(stream_output(merged_iterator))
+    await asyncio.sleep(0.5)
+    task.cancel()
+    with pytest.raises(asyncio.CancelledError):
+        await task
+
+    for iterator in iterators:
+        try:
+            await asyncio.wait_for(anext(iterator), 1)
+        except StopAsyncIteration:
+            # All iterators should be cancelled and print this message.
+            print("Iterator was cancelled normally")
+        except (Exception, asyncio.CancelledError) as e:
+            raise AssertionError() from e
--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
@@ -0,0 +1,157 @@
+# imports for guided decoding tests
+import os
+import subprocess
+import sys
+import time
+
+import openai  # use the official client for correctness check
+import pytest
+# using Ray for overall ease of process management, parallel requests,
+# and debugging.
+import ray
+import requests
+
+MAX_SERVER_START_WAIT_S = 600  # wait for server to start for 60 seconds
+# any model with a chat template should work here
+MODEL_NAME = "facebook/opt-125m"
+
+
+@ray.remote(num_gpus=1)
+class ServerRunner:
+
+    def __init__(self, args):
+        env = os.environ.copy()
+        env["PYTHONUNBUFFERED"] = "1"
+        self.proc = subprocess.Popen(
+            ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args,
+            env=env,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+        )
+        self._wait_for_server()
+
+    def ready(self):
+        return True
+
+    def _wait_for_server(self):
+        # run health check
+        start = time.time()
+        while True:
+            try:
+                if requests.get(
+                        "http://localhost:8000/health").status_code == 200:
+                    break
+            except Exception as err:
+                if self.proc.poll() is not None:
+                    raise RuntimeError("Server exited unexpectedly.") from err
+
+                time.sleep(0.5)
+                if time.time() - start > MAX_SERVER_START_WAIT_S:
+                    raise RuntimeError(
+                        "Server failed to start in time.") from err
+
+    def __del__(self):
+        if hasattr(self, "proc"):
+            self.proc.terminate()
+
+
+@pytest.fixture(scope="session")
+def server():
+    ray.init()
+    server_runner = ServerRunner.remote([
+        "--model",
+        MODEL_NAME,
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "float16",
+        "--max-model-len",
+        "2048",
+        "--enforce-eager",
+        "--engine-use-ray"
+    ])
+    ray.get(server_runner.ready.remote())
+    yield server_runner
+    ray.shutdown()
+
+
+@pytest.fixture(scope="session")
+def client():
+    client = openai.AsyncOpenAI(
+        base_url="http://localhost:8000/v1",
+        api_key="token-abc123",
+    )
+    yield client
+
+
+@pytest.mark.asyncio
+async def test_check_models(server, client: openai.AsyncOpenAI):
+    models = await client.models.list()
+    models = models.data
+    served_model = models[0]
+    assert served_model.id == MODEL_NAME
+    assert all(model.root == MODEL_NAME for model in models)
+
+
+@pytest.mark.asyncio
+async def test_single_completion(server, client: openai.AsyncOpenAI):
+    completion = await client.completions.create(model=MODEL_NAME,
+                                                 prompt="Hello, my name is",
+                                                 max_tokens=5,
+                                                 temperature=0.0)
+
+    assert completion.id is not None
+    assert completion.choices is not None and len(completion.choices) == 1
+    assert completion.choices[0].text is not None and len(
+        completion.choices[0].text) >= 5
+    assert completion.choices[0].finish_reason == "length"
+    assert completion.usage == openai.types.CompletionUsage(
+        completion_tokens=5, prompt_tokens=6, total_tokens=11)
+
+    # test using token IDs
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    assert completion.choices[0].text is not None and len(
+        completion.choices[0].text) >= 5
+
+
+@pytest.mark.asyncio
+async def test_single_chat_session(server, client: openai.AsyncOpenAI):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role": "user",
+        "content": "what is 1+1?"
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
+                                                           messages=messages,
+                                                           max_tokens=10,
+                                                           logprobs=True,
+                                                           top_logprobs=5)
+    assert chat_completion.id is not None
+    assert chat_completion.choices is not None and len(
+        chat_completion.choices) == 1
+    assert chat_completion.choices[0].message is not None
+    assert chat_completion.choices[0].logprobs is not None
+    assert chat_completion.choices[0].logprobs.top_logprobs is not None
+    assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 5
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
--- a/tests/async_engine/test_request_tracker.py
+++ b/tests/async_engine/test_request_tracker.py
@@ -0,0 +1,67 @@
+import pytest
+
+from vllm.engine.async_llm_engine import RequestTracker
+from vllm.outputs import RequestOutput
+
+
+@pytest.mark.asyncio
+async def test_request_tracker():
+    tracker = RequestTracker()
+    stream_1 = tracker.add_request("1")
+    assert tracker.new_requests_event.is_set()
+    await tracker.wait_for_new_requests()
+    new, finished = tracker.get_new_and_finished_requests()
+    assert not tracker.new_requests_event.is_set()
+    assert len(new) == 1
+    assert new[0]["request_id"] == "1"
+    assert not finished
+    assert not stream_1.finished
+
+    stream_2 = tracker.add_request("2")
+    stream_3 = tracker.add_request("3")
+    assert tracker.new_requests_event.is_set()
+    await tracker.wait_for_new_requests()
+    new, finished = tracker.get_new_and_finished_requests()
+    assert not tracker.new_requests_event.is_set()
+    assert len(new) == 2
+    assert new[0]["request_id"] == "2"
+    assert new[1]["request_id"] == "3"
+    assert not finished
+    assert not stream_2.finished
+    assert not stream_3.finished
+
+    # request_ids must be unique
+    with pytest.raises(KeyError):
+        tracker.add_request("1")
+    assert not tracker.new_requests_event.is_set()
+
+    tracker.abort_request("1")
+    new, finished = tracker.get_new_and_finished_requests()
+    assert len(finished) == 1
+    assert "1" in finished
+    assert not new
+    assert stream_1.finished
+
+    stream_4 = tracker.add_request("4")
+    tracker.abort_request("4")
+    assert tracker.new_requests_event.is_set()
+    await tracker.wait_for_new_requests()
+    new, finished = tracker.get_new_and_finished_requests()
+    assert len(finished) == 1
+    assert "4" in finished
+    assert not new
+    assert stream_4.finished
+
+    stream_5 = tracker.add_request("5")
+    assert tracker.new_requests_event.is_set()
+    tracker.process_request_output(
+        RequestOutput("2", "output", [], [], [], finished=True))
+    await tracker.wait_for_new_requests()
+    new, finished = tracker.get_new_and_finished_requests()
+    assert not tracker.new_requests_event.is_set()
+    assert len(finished) == 1
+    assert "2" in finished
+    assert len(new) == 1
+    assert new[0]["request_id"] == "5"
+    assert stream_2.finished
+    assert not stream_5.finished
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -0,0 +1,50 @@
+"""Compare the short outputs of HF and vLLM when using greedy sampling.
+
+Run `pytest tests/basic_correctness/test_basic_correctness.py`.
+"""
+import os
+
+import pytest
+
+MODELS = [
+    "facebook/opt-125m",
+    "meta-llama/Llama-2-7b-hf",
+]
+VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [5])
+@pytest.mark.parametrize("enforce_eager", [False, True])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    enforce_eager: bool,
+) -> None:
+    backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
+    if backend_by_env_var == "FLASHINFER" and enforce_eager is False:
+        pytest.skip("Skipping non-eager test for FlashInferBackend.")
+
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    del hf_model
+
+    vllm_model = vllm_runner(model,
+                             dtype=dtype,
+                             enforce_eager=enforce_eager,
+                             gpu_memory_utilization=0.7)
+    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    del vllm_model
+
+    for i in range(len(example_prompts)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -0,0 +1,65 @@
+"""Compare the outputs of HF and vLLM when using greedy sampling.
+
+It tests chunked prefill. Chunked prefill can be enabled by
+enable_chunked_prefill=True. If prefill size exceeds max_num_batched_tokens,
+prefill requests are chunked.
+
+Run `pytest tests/models/test_chunked_prefill.py`.
+"""
+import pytest
+
+MODELS = [
+    "facebook/opt-125m",
+    "meta-llama/Llama-2-7b-hf",
+]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
+@pytest.mark.parametrize("enforce_eager", [False, True])
+# NOTE: Increasing this in this suite will fail CI because we currently cannot
+# reset distributed env properly. Use a value > 1 just when you test.
+@pytest.mark.parametrize("tensor_parallel_size", [1])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    chunked_prefill_token_size: int,
+    enforce_eager: bool,
+    tensor_parallel_size: int,
+) -> None:
+    max_num_seqs = min(chunked_prefill_token_size, 256)
+    enable_chunked_prefill = False
+    max_num_batched_tokens = None
+    if chunked_prefill_token_size != -1:
+        enable_chunked_prefill = True
+        max_num_batched_tokens = chunked_prefill_token_size
+
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    del hf_model
+
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        max_num_batched_tokens=max_num_batched_tokens,
+        enable_chunked_prefill=enable_chunked_prefill,
+        tensor_parallel_size=tensor_parallel_size,
+        enforce_eager=enforce_eager,
+        max_num_seqs=max_num_seqs,
+    )
+    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    del vllm_model
+
+    for i in range(len(example_prompts)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -0,0 +1,223 @@
+"""Compare the short outputs of HF and vLLM when using greedy sampling.
+
+VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
+
+Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
+pytest tests/basic_correctness/test_preemption.py`.
+"""
+import pytest
+
+from vllm import SamplingParams
+from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
+                                 ENABLE_ARTIFICIAL_PREEMPT)
+
+MODELS = [
+    "facebook/opt-125m",
+]
+
+assert ENABLE_ARTIFICIAL_PREEMPT is True, (
+    "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
+    "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
+    "tests/basic_correctness/test_preemption.py`")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [96])
+@pytest.mark.parametrize("chunked_prefill_token_size", [16])
+def test_chunked_prefill_recompute(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    chunked_prefill_token_size: int,
+) -> None:
+    """Ensure that chunked prefill works with preemption."""
+    max_num_seqs = min(chunked_prefill_token_size, 256)
+    enable_chunked_prefill = False
+    max_num_batched_tokens = None
+    if chunked_prefill_token_size != -1:
+        enable_chunked_prefill = True
+        max_num_batched_tokens = chunked_prefill_token_size
+
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    del hf_model
+
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        max_num_batched_tokens=max_num_batched_tokens,
+        enable_chunked_prefill=enable_chunked_prefill,
+        max_num_seqs=max_num_seqs,
+    )
+    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+            ARTIFICIAL_PREEMPTION_MAX_CNT)
+    del vllm_model
+
+    for i in range(len(example_prompts)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [96])
+def test_preemption(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    """By default, recompute preemption is enabled"""
+
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    del hf_model
+
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+    )
+    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+            ARTIFICIAL_PREEMPTION_MAX_CNT)
+    del vllm_model
+
+    for i in range(len(example_prompts)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [96])
+@pytest.mark.parametrize("beam_width", [4])
+def test_swap(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    beam_width: int,
+) -> None:
+    """Use beam search enables swapping."""
+    example_prompts = example_prompts[:1]
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
+                                               max_tokens)
+    del hf_model
+
+    vllm_model = vllm_runner(model, dtype=dtype, swap_space=10)
+    vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
+                                                   max_tokens)
+    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+            ARTIFICIAL_PREEMPTION_MAX_CNT)
+    del vllm_model
+
+    for i in range(len(example_prompts)):
+        hf_output_ids, _ = hf_outputs[i]
+        vllm_output_ids, _ = vllm_outputs[i]
+        assert len(hf_output_ids) == len(vllm_output_ids)
+        for j in range(len(hf_output_ids)):
+            assert hf_output_ids[j] == vllm_output_ids[j], (
+                f"Test{i} output{j}:\nHF: {hf_output_ids}\n"
+                f"vLLM: {vllm_output_ids}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [96])
+@pytest.mark.parametrize("beam_width", [4])
+def test_swap_infeasible(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    beam_width: int,
+) -> None:
+    """Verify infeasible swap request will be ignored."""
+    BLOCK_SIZE = 16
+    prefill_blocks = 2
+    decode_blocks = max_tokens // BLOCK_SIZE
+    example_prompts = example_prompts[:1]
+
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        swap_space=10,
+        block_size=BLOCK_SIZE,
+        # Since beam search have more than 1 sequence, prefill + decode blocks
+        # are not enough to finish.
+        num_gpu_blocks_override=prefill_blocks + decode_blocks,
+        max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
+    )
+    sampling_params = SamplingParams(n=beam_width,
+                                     use_beam_search=True,
+                                     temperature=0.0,
+                                     max_tokens=max_tokens,
+                                     ignore_eos=True)
+    req_outputs = vllm_model.model.generate(
+        example_prompts,
+        sampling_params=sampling_params,
+    )
+    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+            ARTIFICIAL_PREEMPTION_MAX_CNT)
+    del vllm_model
+    # Verify the request is ignored and not hang.
+    assert req_outputs[0].outputs[0].finish_reason == "length"
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [96])
+def test_preemption_infeasible(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    """Verify infeasible preemption request will be ignored."""
+    BLOCK_SIZE = 16
+    prefill_blocks = 2
+    decode_blocks = max_tokens // BLOCK_SIZE
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        block_size=BLOCK_SIZE,
+        # Not enough gpu blocks to complete a single sequence.
+        # preemption should happen, and the sequence should be
+        # ignored instead of hanging forever.
+        num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
+        max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
+    )
+    sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
+    req_outputs = vllm_model.model.generate(
+        example_prompts,
+        sampling_params=sampling_params,
+    )
+
+    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+            ARTIFICIAL_PREEMPTION_MAX_CNT)
+    del vllm_model
+    # Verify the request is ignored and not hang.
+    for req_output in req_outputs:
+        outputs = req_output.outputs
+        assert len(outputs) == 1
+        assert outputs[0].finish_reason == "length"
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -0,0 +1,417 @@
+import contextlib
+import gc
+import os
+from typing import List, Optional, Tuple
+
+import pytest
+import torch
+from PIL import Image
+from transformers import (AutoModelForCausalLM, AutoProcessor,
+                          LlavaForConditionalGeneration)
+
+from vllm import LLM, SamplingParams
+from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
+from vllm.distributed import destroy_model_parallel
+from vllm.sequence import MultiModalData
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+_TEST_DIR = os.path.dirname(__file__)
+_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
+_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
+
+# Multi modal related
+_PIXEL_VALUES_FILES = [
+    os.path.join(_TEST_DIR, "images", filename) for filename in
+    ["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]
+]
+_IMAGE_FEATURES_FILES = [
+    os.path.join(_TEST_DIR, "images", filename) for filename in
+    ["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"]
+]
+_IMAGE_FILES = [
+    os.path.join(_TEST_DIR, "images", filename)
+    for filename in ["stop_sign.jpg", "cherry_blossom.jpg"]
+]
+_IMAGE_PROMPTS = [
+    "<image>\nUSER: What's the content of the image?\nASSISTANT:",
+    "<image>\nUSER: What is the season?\nASSISTANT:"
+]
+assert len(_PIXEL_VALUES_FILES) == len(_IMAGE_FEATURES_FILES) == len(
+    _IMAGE_FILES) == len(_IMAGE_PROMPTS)
+
+
+def _read_prompts(filename: str) -> List[str]:
+    with open(filename, "r") as f:
+        prompts = f.readlines()
+        return prompts
+
+
+def cleanup():
+    destroy_model_parallel()
+    with contextlib.suppress(AssertionError):
+        torch.distributed.destroy_process_group()
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+@pytest.fixture()
+def should_do_global_cleanup_after_test(request) -> bool:
+    """Allow subdirectories to skip global cleanup by overriding this fixture.
+    This can provide a ~10x speedup for non-GPU unit tests since they don't need
+    to initialize torch.
+    """
+
+    if request.node.get_closest_marker("skip_global_cleanup"):
+        return False
+
+    return True
+
+
+@pytest.fixture(autouse=True)
+def cleanup_fixture(should_do_global_cleanup_after_test: bool):
+    yield
+    if should_do_global_cleanup_after_test:
+        cleanup()
+
+
+@pytest.fixture(scope="session")
+def hf_image_prompts() -> List[str]:
+    return _IMAGE_PROMPTS
+
+
+@pytest.fixture(scope="session")
+def hf_images() -> List[Image.Image]:
+    return [Image.open(filename) for filename in _IMAGE_FILES]
+
+
+@pytest.fixture()
+def vllm_images(request) -> "torch.Tensor":
+    vision_language_config = request.getfixturevalue("model_and_config")[1]
+    all_images = []
+    if vision_language_config.image_input_type == (
+            VisionLanguageConfig.ImageInputType.IMAGE_FEATURES):
+        filenames = _IMAGE_FEATURES_FILES
+    else:
+        filenames = _PIXEL_VALUES_FILES
+    for filename in filenames:
+        all_images.append(torch.load(filename))
+    return torch.concat(all_images, dim=0)
+
+
+@pytest.fixture()
+def vllm_image_prompts(request) -> List[str]:
+    vision_language_config = request.getfixturevalue("model_and_config")[1]
+    return [
+        "<image>" * (vision_language_config.image_feature_size - 1) + p
+        for p in _IMAGE_PROMPTS
+    ]
+
+
+@pytest.fixture
+def example_prompts() -> List[str]:
+    prompts = []
+    for filename in _TEST_PROMPTS:
+        prompts += _read_prompts(filename)
+    return prompts
+
+
+@pytest.fixture
+def example_long_prompts() -> List[str]:
+    prompts = []
+    for filename in _LONG_PROMPTS:
+        prompts += _read_prompts(filename)
+    return prompts
+
+
+_STR_DTYPE_TO_TORCH_DTYPE = {
+    "half": torch.half,
+    "bfloat16": torch.bfloat16,
+    "float": torch.float,
+}
+
+_VISION_LANGUAGE_MODELS = {
+    "llava-hf/llava-1.5-7b-hf": LlavaForConditionalGeneration,
+}
+
+
+class HfRunner:
+
+    def __init__(
+        self,
+        model_name: str,
+        tokenizer_name: Optional[str] = None,
+        dtype: str = "half",
+    ) -> None:
+        assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
+        torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
+        self.model_name = model_name
+        if model_name not in _VISION_LANGUAGE_MODELS:
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype=torch_dtype,
+                trust_remote_code=True,
+            ).cuda()
+            self.processor = None
+        else:
+            self.model = _VISION_LANGUAGE_MODELS[model_name].from_pretrained(
+                model_name,
+                torch_dtype=torch_dtype,
+                trust_remote_code=True,
+            ).cuda()
+            self.processor = AutoProcessor.from_pretrained(
+                model_name,
+                torch_dtype=torch_dtype,
+            )
+        if tokenizer_name is None:
+            tokenizer_name = model_name
+        self.tokenizer = get_tokenizer(tokenizer_name, trust_remote_code=True)
+
+    def generate(
+        self,
+        prompts: List[str],
+        images: Optional[List[Image.Image]] = None,
+        **kwargs,
+    ) -> List[Tuple[List[int], str]]:
+        outputs: List[Tuple[List[int], str]] = []
+        if images:
+            assert len(prompts) == len(images)
+        for i, prompt in enumerate(prompts):
+            if self.model_name not in _VISION_LANGUAGE_MODELS:
+                input_ids = self.tokenizer(prompt,
+                                           return_tensors="pt").input_ids
+                inputs = {"input_ids": input_ids.cuda()}
+            else:
+                image = images[i] if images else None
+                inputs = self.processor(text=prompt,
+                                        images=image,
+                                        return_tensors="pt")
+                inputs = {
+                    key: value.cuda() if value is not None else None
+                    for key, value in inputs.items()
+                }
+            output_ids = self.model.generate(
+                **inputs,
+                use_cache=True,
+                **kwargs,
+            )
+            output_str = self.tokenizer.batch_decode(
+                output_ids,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=False,
+            )
+            output_ids = output_ids.cpu().tolist()
+            outputs.append((output_ids, output_str))
+        return outputs
+
+    def generate_greedy(
+        self,
+        prompts: List[str],
+        max_tokens: int,
+        images: Optional["torch.Tensor"] = None,
+    ) -> List[Tuple[List[int], str]]:
+        outputs = self.generate(prompts,
+                                do_sample=False,
+                                max_new_tokens=max_tokens,
+                                images=images)
+        for i in range(len(outputs)):
+            output_ids, output_str = outputs[i]
+            outputs[i] = (output_ids[0], output_str[0])
+        return outputs
+
+    def generate_beam_search(
+        self,
+        prompts: List[str],
+        beam_width: int,
+        max_tokens: int,
+    ) -> List[Tuple[List[int], str]]:
+        outputs = self.generate(prompts,
+                                do_sample=False,
+                                max_new_tokens=max_tokens,
+                                num_beams=beam_width,
+                                num_return_sequences=beam_width)
+        for i in range(len(outputs)):
+            output_ids, output_str = outputs[i]
+            for j in range(len(output_ids)):
+                output_ids[j] = [
+                    x for x in output_ids[j]
+                    if x != self.tokenizer.pad_token_id
+                ]
+            outputs[i] = (output_ids, output_str)
+        return outputs
+
+    def generate_greedy_logprobs(
+        self,
+        prompts: List[str],
+        max_tokens: int,
+    ) -> List[List[torch.Tensor]]:
+        all_logprobs = []
+        for prompt in prompts:
+            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+            output = self.model.generate(
+                input_ids.cuda(),
+                use_cache=True,
+                do_sample=False,
+                max_new_tokens=max_tokens,
+                output_hidden_states=True,
+                return_dict_in_generate=True,
+            )
+            seq_logprobs = []
+            for hidden_states in output.hidden_states:
+                last_hidden_states = hidden_states[-1][0]
+                logits = torch.matmul(
+                    last_hidden_states,
+                    self.model.get_output_embeddings().weight.t(),
+                )
+                if self.model.get_output_embeddings().bias is not None:
+                    logits += self.model.get_output_embeddings(
+                    ).bias.unsqueeze(0)
+                logprobs = torch.nn.functional.log_softmax(logits,
+                                                           dim=-1,
+                                                           dtype=torch.float32)
+                seq_logprobs.append(logprobs)
+            all_logprobs.append(seq_logprobs)
+        return all_logprobs
+
+    def __del__(self):
+        del self.model
+        cleanup()
+
+
+@pytest.fixture
+def hf_runner():
+    return HfRunner
+
+
+class VllmRunner:
+
+    def __init__(
+        self,
+        model_name: str,
+        tokenizer_name: Optional[str] = None,
+        # Use smaller max model length, otherwise bigger model cannot run due
+        # to kv cache size limit.
+        max_model_len=1024,
+        dtype: str = "half",
+        disable_log_stats: bool = True,
+        tensor_parallel_size: int = 1,
+        block_size: int = 16,
+        enable_chunked_prefill: bool = False,
+        swap_space=4,
+        **kwargs,
+    ) -> None:
+        self.model = LLM(
+            model=model_name,
+            tokenizer=tokenizer_name,
+            trust_remote_code=True,
+            dtype=dtype,
+            swap_space=swap_space,
+            disable_log_stats=disable_log_stats,
+            tensor_parallel_size=tensor_parallel_size,
+            max_model_len=max_model_len,
+            block_size=block_size,
+            enable_chunked_prefill=enable_chunked_prefill,
+            **kwargs,
+        )
+
+    def generate(
+        self,
+        prompts: List[str],
+        sampling_params: SamplingParams,
+        images: Optional["torch.Tensor"] = None,
+    ) -> List[Tuple[List[int], str]]:
+        if images is not None:
+            assert len(prompts) == images.shape[0]
+        req_outputs = self.model.generate(
+            prompts,
+            sampling_params=sampling_params,
+            multi_modal_data=MultiModalData(type=MultiModalData.Type.IMAGE,
+                                            data=images)
+            if images is not None else None)
+        outputs = []
+        for req_output in req_outputs:
+            prompt_str = req_output.prompt
+            prompt_ids = req_output.prompt_token_ids
+            req_sample_output_ids = []
+            req_sample_output_strs = []
+            for sample in req_output.outputs:
+                output_str = sample.text
+                output_ids = sample.token_ids
+                req_sample_output_ids.append(prompt_ids + output_ids)
+                req_sample_output_strs.append(prompt_str + output_str)
+            outputs.append((req_sample_output_ids, req_sample_output_strs))
+        return outputs
+
+    def generate_w_logprobs(
+        self,
+        prompts: List[str],
+        sampling_params: SamplingParams,
+    ) -> List[Tuple[List[int], str]]:
+        assert sampling_params.logprobs is not None
+
+        req_outputs = self.model.generate(prompts,
+                                          sampling_params=sampling_params)
+        outputs = []
+        for req_output in req_outputs:
+            for sample in req_output.outputs:
+                output_str = sample.text
+                output_ids = sample.token_ids
+                output_logprobs = sample.logprobs
+            outputs.append((output_ids, output_str, output_logprobs))
+        return outputs
+
+    def generate_greedy(
+        self,
+        prompts: List[str],
+        max_tokens: int,
+        images: Optional[torch.Tensor] = None,
+    ) -> List[Tuple[List[int], str]]:
+        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
+        outputs = self.generate(prompts, greedy_params, images=images)
+        return [(output_ids[0], output_str[0])
+                for output_ids, output_str in outputs]
+
+    def generate_greedy_logprobs(
+        self,
+        prompts: List[str],
+        max_tokens: int,
+        num_logprobs: int,
+    ) -> List[Tuple[List[int], str]]:
+        greedy_logprobs_params = SamplingParams(temperature=0.0,
+                                                max_tokens=max_tokens,
+                                                logprobs=num_logprobs)
+        outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params)
+
+        return [(output_ids, output_str, output_logprobs)
+                for output_ids, output_str, output_logprobs in outputs]
+
+    def generate_beam_search(
+        self,
+        prompts: List[str],
+        beam_width: int,
+        max_tokens: int,
+    ) -> List[Tuple[List[int], str]]:
+        beam_search_params = SamplingParams(n=beam_width,
+                                            use_beam_search=True,
+                                            temperature=0.0,
+                                            max_tokens=max_tokens)
+        outputs = self.generate(prompts, beam_search_params)
+        return outputs
+
+    def __del__(self):
+        del self.model
+        cleanup()
+
+
+@pytest.fixture(scope="session")
+def vllm_runner():
+    return VllmRunner
+
+
+def get_tokenizer_pool_config(tokenizer_group_type):
+    if tokenizer_group_type is None:
+        return None
+    if tokenizer_group_type == "ray":
+        return TokenizerPoolConfig(pool_size=1,
+                                   pool_type="ray",
+                                   extra_config={})
+    raise ValueError(f"Unknown tokenizer_group_type: {tokenizer_group_type}")
--- a/tests/core/init.py
+++ b/tests/core/init.py
--- a/tests/core/block/init.py
+++ b/tests/core/block/init.py
--- a/tests/core/block/conftest.py
+++ b/tests/core/block/conftest.py
@@ -0,0 +1,12 @@
+import pytest
+
+
+@pytest.fixture()
+def should_do_global_cleanup_after_test() -> bool:
+    """Disable the global cleanup fixture for tests in this directory. This
+    provides a ~10x speedup for unit tests that don't load a model to GPU.
+
+    This requires that tests in this directory clean up after themselves if they
+    use the GPU.
+    """
+    return False
--- a/tests/core/block/e2e/conftest.py
+++ b/tests/core/block/e2e/conftest.py
@@ -0,0 +1,41 @@
+import pytest
+
+from tests.conftest import cleanup
+from vllm import LLM
+from vllm.model_executor.utils import set_random_seed
+
+
+@pytest.fixture
+def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
+                           baseline_llm_kwargs, seed):
+    return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
+                                baseline_llm_kwargs, seed)
+
+
+@pytest.fixture
+def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
+                       test_llm_kwargs, seed):
+    return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
+                                test_llm_kwargs, seed)
+
+
+def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
+                         distinct_llm_kwargs, seed):
+    kwargs = {
+        **common_llm_kwargs,
+        **per_test_common_llm_kwargs,
+        **distinct_llm_kwargs,
+    }
+
+    def generator_inner():
+        llm = LLM(**kwargs)
+
+        set_random_seed(seed)
+
+        yield llm
+        del llm
+        cleanup()
+
+    for llm in generator_inner():
+        yield llm
+        del llm
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -0,0 +1,455 @@
+from itertools import cycle
+
+import pytest
+
+from vllm import SamplingParams
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        "model": "facebook/opt-125m",
+
+        # skip cuda graph creation for fast test.
+        "enforce_eager": True,
+
+        # Allow only 5 sequences of ~1024 tokens in worst case.
+        "block_size": 16,
+        "num_gpu_blocks_override": 5 * (64 + 1),
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{
+    "use_v2_block_manager": False
+}])
+@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
+@pytest.mark.parametrize("batch_size", [10])
+@pytest.mark.parametrize("seed", [1])
+def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
+                                               test_llm_generator, batch_size):
+    """Verify block manager v2 produces same outputs as block manager v1, even
+    when there is preemption.
+
+    This constructs two LLM, each with limited number of GPU blocks. The limit
+    is decided such that as the sequences in the batch grow, sequences must be
+    preempted and removed from cache.
+
+    If the output token ids are equivalent, then we have confidence that the KV
+    cache is not corrupted in the v2 block manager.
+
+    NOTE: We want a significant number of generated tokens so that any incorrect
+    KV mapping has time to build up error.
+    """
+    output_len = 1024
+    temperature = 0.0
+
+    # We want to ensure equality even with preemption.
+    # We force the total block size to be 1 + cdiv(output_len, block_size)
+    # so that only one sequence can fit at a time (once the sequences grow).
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+
+    print('Getting token ids from block manager v1')
+    baseline_token_ids = get_token_ids_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+
+    print('Getting token ids from block manager v2')
+    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
+                                                      prompts, sampling_params)
+
+    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
+                                                    test_token_ids):
+        assert expected_token_ids == actual_token_ids
+
+    assert baseline_token_ids == test_token_ids
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        "model": "facebook/opt-125m",
+
+        # skip cuda graph creation for fast test.
+        "enforce_eager": True,
+
+        # Use a large block size to trigger more copy-on-writes.
+        "block_size": 32,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{
+    "use_v2_block_manager": False
+}])
+@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
+@pytest.mark.parametrize("batch_size", [10])
+@pytest.mark.parametrize("seed", [1])
+def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator,
+                                        test_llm_generator, batch_size):
+    """Verify beam search equality with block manager v1 and v2.
+
+    This requires copy-on-writes; if the v1 and v2 output is the same, then
+    we have some confidence cow is working.
+    """
+    output_len = 128
+    temperature = 0.0
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+        use_beam_search=True,
+        best_of=2,
+    )
+
+    print('Getting token ids from block manager v1')
+    baseline_token_ids = get_token_ids_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+
+    print('Getting token ids from block manager v2')
+    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
+                                                      prompts, sampling_params)
+
+    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
+                                                    test_token_ids):
+        assert expected_token_ids == actual_token_ids
+
+    assert baseline_token_ids == test_token_ids
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        "model": "facebook/opt-125m",
+
+        # Our prompts will generate 128 tokens; since the prompts themselves are
+        # small, we don't need much KV space beyond 128.
+        "max_model_len": 160,
+
+        # skip cuda graph creation for fast test.
+        "enforce_eager": True,
+
+        # Lookahead scheduling only supported in v2 block manager.
+        "use_v2_block_manager": True,
+    }])
+@pytest.mark.parametrize(
+    "per_test_common_llm_kwargs",
+    [
+        {
+            "block_size": 16,
+
+            # Allow only 2 sequences of ~128 tokens in worst case.
+            # Note 8 = 128/block_size
+            "num_gpu_blocks_override": 2 * (8 + 1),
+        },
+        {
+            "block_size": 8,
+
+            # Allow only 2 sequences of ~128 tokens in worst case.
+            # Note 16 = 128/block_size
+            "num_gpu_blocks_override": 2 * (16 + 1),
+        }
+    ])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{
+    "num_lookahead_slots": 0,
+}])
+@pytest.mark.parametrize(
+    "test_llm_kwargs",
+    [{
+        # We run one test with block_size < lookahead_slots, one test with
+        # block_size > lookahead_slots
+        "num_lookahead_slots": 10,
+    }])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
+                                                   test_llm_generator,
+                                                   batch_size):
+    """Verify vLLM produces the same output with greedy sampling, when lookahead
+    scheduling is used vs. not.
+
+    Lookahead scheduling is not expected to modify the output, as it simply
+    allocates empty slots ahead of the known token ids in a sliding fashion.
+
+    This test constrains the total number of blocks to force preemption. It also
+    varies the block size so that the lookahead size is less than and greater
+    than the block size.
+    """
+    output_len = 128
+    temperature = 0.0
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+
+    print('Getting token ids without lookahead scheduling')
+    baseline_token_ids = get_token_ids_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+
+    print('Getting token ids with lookahead scheduling')
+    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
+                                                      prompts, sampling_params)
+
+    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
+                                                    test_token_ids):
+        assert expected_token_ids == actual_token_ids
+
+    assert baseline_token_ids == test_token_ids
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
+
+            # skip cuda graph creation for fast test.
+            "enforce_eager": True,
+            "enable_chunked_prefill": True,
+            "max_num_batched_tokens": 2,
+            "max_num_seqs": 2,
+        },
+    ])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [
+    {
+        "use_v2_block_manager": False,
+    },
+])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "use_v2_block_manager": True,
+        "num_lookahead_slots": 0,
+    },
+    {
+        "use_v2_block_manager": True,
+        "num_lookahead_slots": 5,
+    },
+])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
+                                          test_llm_generator, batch_size):
+    """Verify that chunked prefill works with BlockManagerV2, with and without
+    lookahead scheduling.
+    """
+    output_len = 32
+    temperature = 0.0
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+
+    print('Getting token ids with BlockManagerV1')
+    baseline_token_ids = get_token_ids_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+
+    print('Getting token ids with BlockManagerV2')
+    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
+                                                      prompts, sampling_params)
+
+    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
+                                                    test_token_ids):
+        assert expected_token_ids == actual_token_ids
+
+    assert baseline_token_ids == test_token_ids
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        "model": "facebook/opt-125m",
+
+        # skip cuda graph creation for fast test.
+        "enforce_eager": True,
+
+        # Allow only 5 sequences of ~1024 tokens in worst case.
+        "block_size": 16,
+        "num_gpu_blocks_override": 5 * (64 + 1),
+
+        # Enable prefill cache
+        "enable_prefix_caching": True,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{
+    "use_v2_block_manager": False
+}])
+@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
+@pytest.mark.parametrize("batch_size", [10])
+@pytest.mark.parametrize("seed", [1])
+def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
+        baseline_llm_generator, test_llm_generator, batch_size):
+    """Verify block manager v2 produces same outputs as block manager v1, even
+    when there is preemption.
+
+    This constructs two LLM, each with limited number of GPU blocks. The limit
+    is decided such that as the sequences in the batch grow, sequences must be
+    preempted and removed from cache.
+
+    If the output token ids are equivalent, then we have confidence that the KV
+    cache is not corrupted in the v2 block manager.
+
+    NOTE: We want a significant number of generated tokens so that any incorrect
+    KV mapping has time to build up error.
+    """
+    output_len = 1024
+    temperature = 0.0
+
+    # We want to ensure equality even with preemption.
+    # We force the total block size to be 1 + cdiv(output_len, block_size)
+    # so that only one sequence can fit at a time (once the sequences grow).
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+
+    print('Getting token ids from block manager v1')
+    baseline_token_ids = get_token_ids_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+
+    print('Getting token ids from block manager v2')
+    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
+                                                      prompts, sampling_params)
+
+    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
+                                                    test_token_ids):
+        assert expected_token_ids == actual_token_ids
+
+    assert baseline_token_ids == test_token_ids
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        "model": "facebook/opt-125m",
+
+        # skip cuda graph creation for fast test.
+        "enforce_eager": True,
+
+        # Allow only 5 sequences of ~1024 tokens in worst case.
+        "block_size": 16,
+        "num_gpu_blocks_override": 5 * (64 + 1),
+
+        # Test APC in v2 block
+        "use_v2_block_manager": True,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{
+    "enable_prefix_caching": False
+}])
+@pytest.mark.parametrize("test_llm_kwargs", [{"enable_prefix_caching": True}])
+@pytest.mark.parametrize("batch_size", [10])
+@pytest.mark.parametrize("seed", [1])
+def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
+                                             test_llm_generator, batch_size):
+    """Verify block manager v2 with auto prefix caching enabled produces same
+    outputs as auto prefix caching disabled, even when there is preemption.
+
+    This constructs two LLM, each with limited number of GPU blocks. The limit
+    is decided such that as the sequences in the batch grow, sequences must be
+    preempted and removed from cache.
+
+    If the output token ids are equivalent, then we have confidence that auto
+    prefix caching itself at least don't cause result error.
+    """
+    output_len = 1024
+    temperature = 0.0
+
+    # We want to ensure equality even with preemption.
+    # We force the total block size to be 1 + cdiv(output_len, block_size)
+    # so that only one sequence can fit at a time (once the sequences grow).
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+
+    print('Getting token ids with APC disabled')
+    baseline_token_ids = get_token_ids_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+
+    print('Getting token ids with APC enabled')
+    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
+                                                      prompts, sampling_params)
+
+    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
+                                                    test_token_ids):
+        assert expected_token_ids == actual_token_ids
+
+    assert baseline_token_ids == test_token_ids
+
+
+def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
+    for llm in llm_generator:
+        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
+        token_ids = [output.outputs[0].token_ids for output in outputs]
+        del llm
+
+    return token_ids
--- a/tests/core/block/test_block_manager_v2.py
+++ b/tests/core/block/test_block_manager_v2.py
@@ -0,0 +1,103 @@
+import pytest
+
+from vllm.core.block_manager_v2 import BlockSpaceManagerV2
+from vllm.core.interfaces import AllocStatus
+from vllm.sequence import Logprob, SequenceStatus
+from vllm.utils import chunk_list
+
+from ..utils import create_seq_group
+
+
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80])
+@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
+@pytest.mark.parametrize("watermark", [0.0, 0.5])
+def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
+                                num_gpu_blocks: int, watermark: float):
+    block_manager = BlockSpaceManagerV2(
+        block_size=block_size,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=1024,
+        watermark=watermark,
+    )
+    num_watermark_blocks = int(watermark * num_gpu_blocks)
+
+    num_output_blocks_per_seq = 1
+
+    # NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
+    # the current implementation assumes all seqs are new prompts / don't have
+    # different output lens.
+    num_output_blocks = num_output_blocks_per_seq
+
+    for num_prompt_blocks in range(1, num_gpu_blocks - num_output_blocks):
+        seq_group = create_seq_group(
+            seq_prompt_len=block_size * num_prompt_blocks,
+            seq_output_lens=[
+                block_size * num_output_blocks_per_seq
+                for _ in range(num_seqs_per_group)
+            ],
+        )
+
+        assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
+
+        can_allocate_result = block_manager.can_allocate(seq_group)
+
+        num_required_blocks = num_prompt_blocks + num_output_blocks
+
+        if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
+            assert can_allocate_result == AllocStatus.NEVER
+        elif num_gpu_blocks >= num_required_blocks:
+            assert can_allocate_result == AllocStatus.OK
+        else:
+            assert can_allocate_result == AllocStatus.LATER
+
+
+@pytest.mark.parametrize("block_size", [1, 8])
+@pytest.mark.parametrize("prompt_len", [1, 7, 8])
+@pytest.mark.parametrize("num_slots_to_append", [1, 8, 129])
+@pytest.mark.parametrize("num_lookahead_slots", [0, 10])
+def test_append_slots(block_size, prompt_len, num_slots_to_append,
+                      num_lookahead_slots):
+    """Verify append_slots consumes the correct number of blocks from the block
+    table.
+    """
+
+    num_gpu_blocks = 1024
+    watermark = 0.1
+    block_manager = BlockSpaceManagerV2(
+        block_size=block_size,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=0,
+        watermark=watermark,
+    )
+
+    seq_group = create_seq_group(
+        seq_prompt_len=prompt_len,
+        seq_output_lens=[0],
+    )
+
+    # Allocate seq
+    assert block_manager.can_allocate(seq_group)
+    block_manager.allocate(seq_group)
+
+    # Seq seq to RUNNING
+    seq = seq_group.get_seqs()[0]
+    seq.status = SequenceStatus.RUNNING
+
+    # Append tokens to the sequeqnce
+    for token_id in range(num_slots_to_append):
+        seq.append_token_id(token_id, {token_id: Logprob(0.0)})
+
+    # Append slots for new tokens and lookahead slots.
+    free_blocks_before_append = block_manager.get_num_free_gpu_blocks()
+    block_manager.append_slots(seq, num_lookahead_slots)
+    num_consumed_blocks = (free_blocks_before_append -
+                           block_manager.get_num_free_gpu_blocks())
+
+    # Expect consumed blocks to be new blocks required to support the new slots.
+    expected_consumed_blocks = len(
+        chunk_list(
+            list(
+                range(prompt_len + num_slots_to_append + num_lookahead_slots)),
+            block_size)) - len(chunk_list(list(range(prompt_len)), block_size))
+    assert num_consumed_blocks == expected_consumed_blocks
--- a/tests/core/block/test_block_table.py
+++ b/tests/core/block/test_block_table.py
@@ -0,0 +1,575 @@
+import pytest
+
+from vllm.core.block.block_table import BlockTable
+from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
+from vllm.utils import Device, cdiv, chunk_list
+
+
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("sequence_len", [1, 16, 129])
+def test_allocate_naive(block_size: int, sequence_len: int):
+    """Test the allocation of blocks using the naive allocator.
+
+    This test creates a CpuGpuBlockAllocator with the specified block size and
+    number of blocks. It then allocates multiple BlockTables with varying
+    sequence lengths and verifies that the number of free blocks decreases as
+    expected after each allocation.
+    """
+    assert block_size > 1
+    num_gpu_blocks = 1024
+
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type="naive",
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=1024,
+        block_size=block_size,
+    )
+
+    token_ids = list(range(sequence_len))
+    num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
+
+    block_tables = []
+    for i in range(5):
+        assert allocator.get_num_free_blocks(
+            device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc
+
+        block_tables.append(
+            BlockTable(
+                block_size=block_size,
+                block_allocator=allocator,
+            ))
+        block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
+
+
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("sequence_len", [1, 16, 129])
+def test_allocate_prefix_caching(block_size: int, sequence_len: int):
+    """Test the allocation of blocks using the prefix caching allocator.
+
+    This test creates a CpuGpuBlockAllocator with the specified block size and
+    number of blocks, using the prefix caching allocator. It then allocates
+    multiple BlockTables with varying sequence lengths and verifies that the
+    number of free blocks decreases as expected after each allocation.
+
+    The test expects all sequences to share allocations, except for their last
+    block, which may be mutable. It calculates the expected number of immutable
+    and mutable blocks per allocation based on the sequence length and block
+    size.
+    """
+    assert block_size > 1
+    num_gpu_blocks = 1024
+
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type="prefix_caching",
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=1024,
+        block_size=block_size,
+    )
+
+    token_ids = list(range(sequence_len))
+    chunked_tokens = list(chunk_list(token_ids, block_size))
+    num_mutable_blocks_per_alloc = 0 if len(
+        chunked_tokens[-1]) == block_size else 1
+    num_immutable_blocks_per_alloc = len(
+        chunked_tokens) - num_mutable_blocks_per_alloc
+
+    block_tables = []
+    for alloc_i in range(1, 6):
+
+        block_tables.append(
+            BlockTable(
+                block_size=block_size,
+                block_allocator=allocator,
+            ))
+        block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
+
+        # Expect all sequences to share allocations, except for their last block
+        # (which may be mutable).
+        assert allocator.get_num_free_blocks(
+            device=Device.GPU) == num_gpu_blocks - (
+                num_immutable_blocks_per_alloc + num_mutable_blocks_per_alloc *
+                (alloc_i))
+
+
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("sequence_len", [1, 16, 129])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+@pytest.mark.parametrize("device", ["cpu", "gpu"])
+def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str,
+                       device: str):
+    """Test the allocation and freeing of blocks using different allocators and
+    devices.
+
+    This test creates a CpuGpuBlockAllocator with the specified block size,
+    number of blocks, allocator type, and device. It then allocates a BlockTable
+    multiple times with the same sequence and verifies that the number of free
+    blocks remains consistent after each allocation and freeing.
+    """
+    device = Device[device.upper()]
+
+    num_device_blocks = 1024
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_device_blocks,
+        num_cpu_blocks=num_device_blocks,
+        block_size=block_size,
+    )
+
+    token_ids = list(range(sequence_len))
+    num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
+
+    block_table = BlockTable(
+        block_size=block_size,
+        block_allocator=allocator,
+    )
+
+    for i in range(5):
+        block_table.allocate(token_ids=token_ids, device=device)
+        assert allocator.get_num_free_blocks(
+            device) == num_device_blocks - num_blocks_per_alloc
+        assert all(block_id is not None
+                   for block_id in block_table.physical_block_ids)
+
+        block_table.free()
+        assert allocator.get_num_free_blocks(device) == num_device_blocks
+
+
+@pytest.mark.parametrize("block_size", [1, 8])
+@pytest.mark.parametrize("sequence_len", [1, 16, 129])
+@pytest.mark.parametrize("append_len", [1, 16, 129])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+def test_append_token_ids_allocation(block_size: int, sequence_len: int,
+                                     append_len: int, allocator_type: str):
+    """Test the allocation behavior when appending token IDs to a BlockTable.
+
+    This test creates a CpuGpuBlockAllocator with the specified block size,
+    number of blocks, and allocator type. It then allocates a BlockTable with an
+    initial sequence and appends additional token IDs to it. The test verifies
+    that the number of allocated blocks before and after appending matches the
+    expected values.
+    """
+
+    num_gpu_blocks = 1024
+
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=1024,
+        block_size=block_size,
+    )
+
+    token_ids = list(range(sequence_len))
+    token_ids_to_append = list(range(append_len))
+
+    block_table = BlockTable(
+        block_size=block_size,
+        block_allocator=allocator,
+    )
+
+    num_expected_blocks_before_append = len(
+        list(chunk_list(token_ids, block_size)))
+    num_expected_appended_blocks = len(
+        list(chunk_list(token_ids + token_ids_to_append,
+                        block_size))) - num_expected_blocks_before_append
+
+    block_table.allocate(token_ids=token_ids, device=Device.GPU)
+
+    assert len(
+        block_table.physical_block_ids) == num_expected_blocks_before_append
+    block_table.append_token_ids(token_ids_to_append)
+    assert len(
+        block_table.physical_block_ids
+    ) == num_expected_blocks_before_append + num_expected_appended_blocks
+
+
+@pytest.mark.parametrize("block_size", [1, 8])
+@pytest.mark.parametrize("sequence_len", [1, 16, 129])
+@pytest.mark.parametrize("num_empty_slots", [1, 16, 129])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int,
+                                           num_empty_slots: int,
+                                           allocator_type: str):
+    """Test the allocation behavior when ensuring a certain number of empty
+    slots in a BlockTable.
+
+    This test creates a CpuGpuBlockAllocator with the specified block size,
+    number of blocks, and allocator type. It then allocates a BlockTable with an
+    initial sequence and ensures a certain number of empty slots. The test
+    verifies that the number of allocated blocks before and after ensuring empty
+    slots matches the expected values. It also checks that filling up the empty
+    slots does not consume additional blocks.
+    """
+    num_gpu_blocks = 1024
+
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=1024,
+        block_size=block_size,
+    )
+
+    token_ids = list(range(sequence_len))
+
+    block_table = BlockTable(
+        block_size=block_size,
+        block_allocator=allocator,
+    )
+
+    num_expected_blocks_before_append = len(
+        list(chunk_list(token_ids, block_size)))
+    num_expected_appended_blocks = len(
+        list(chunk_list(token_ids + [-1] * num_empty_slots,
+                        block_size))) - num_expected_blocks_before_append
+
+    block_table.allocate(token_ids=token_ids, device=Device.GPU)
+
+    # Assert that the empty slots consume the expected number of additional
+    # blocks.
+    assert len(
+        block_table.physical_block_ids) == num_expected_blocks_before_append
+    block_table.ensure_num_empty_slots(num_empty_slots)
+    assert len(
+        block_table.physical_block_ids
+    ) == num_expected_blocks_before_append + num_expected_appended_blocks
+
+    # Now, ensure no additional blocks consumed as we fill up the empty slots.
+    num_free_blocks = allocator.get_num_free_blocks(device=Device.GPU)
+    block_table.append_token_ids(token_ids=list(range(num_empty_slots)))
+    assert num_free_blocks == allocator.get_num_free_blocks(device=Device.GPU)
+
+
+@pytest.mark.parametrize("block_size", [1, 8])
+@pytest.mark.parametrize("sequence_len", [1, 9])
+@pytest.mark.parametrize("append_len", [1, 16, 129])
+@pytest.mark.parametrize("append_size", [1, 4, 129])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
+                                          append_len: int, allocator_type: str,
+                                          append_size: int):
+    """Verify token ids are correctly appended. Appends various amounts of
+    token ids in various append sizes, and verifies the final sequence is
+    correct.
+    """
+    num_gpu_blocks = 1024
+
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=1024,
+        block_size=block_size,
+    )
+
+    token_ids = list(range(sequence_len))
+    token_ids_to_append = list(range(append_len))
+
+    block_table = BlockTable(
+        block_size=block_size,
+        block_allocator=allocator,
+    )
+    block_table.allocate(token_ids=token_ids, device=Device.GPU)
+
+    appended_so_far = []
+    for append in chunk_list(token_ids_to_append, append_size):
+        block_table.append_token_ids(append)
+        appended_so_far.extend(append)
+
+        assert block_table._get_all_token_ids() == token_ids + appended_so_far
+
+    assert block_table._get_all_token_ids() == token_ids + token_ids_to_append
+
+
+@pytest.mark.parametrize("seq_len", [1, 9, 129])
+@pytest.mark.parametrize("block_size", [1, 8])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+def test_fork(seq_len: int, block_size: int, allocator_type: str):
+    """Create a sequence using the specified allocator.
+        1. Assert that after forking the sequence, the free block count is the
+            same.
+        2. Assert that the forked sequence has the same physical mappings.
+        3. Then free the original sequence; verify that the free block count is
+            the same.
+        4. Finally, free the forked sequence and verify that the free block
+            count drops to zero.
+    """
+    num_gpu_blocks = 1024
+
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=0,
+        block_size=block_size,
+    )
+
+    token_ids = list(range(seq_len))
+
+    block_table = BlockTable(
+        block_size=block_size,
+        block_allocator=allocator,
+    )
+
+    block_table.allocate(token_ids)
+
+    num_free_blocks_before_fork = allocator.get_num_free_blocks(
+        device=Device.GPU)
+
+    forked_block_table = block_table.fork()
+
+    # Expect physical_block_ids and token_ids to match.
+    assert (block_table.physical_block_ids ==
+            forked_block_table.physical_block_ids)
+    assert block_table._get_all_token_ids(
+    ) == forked_block_table._get_all_token_ids()
+
+    # Do not expect any additional allocations.
+    assert allocator.get_num_free_blocks(
+        device=Device.GPU) == num_free_blocks_before_fork
+
+    # Free the original blocks. Assert num free blocks does not change, since
+    # refcount is nonzero.
+    block_table.free()
+    assert allocator.get_num_free_blocks(
+        device=Device.GPU) == num_free_blocks_before_fork
+
+    # Expect the forked block table to be unaffected by the free.
+    assert all(block_id is not None
+               for block_id in forked_block_table.physical_block_ids)
+
+    # Free the forked blocks. Assert num free blocks does change, since
+    # refcount is now zero.
+    forked_block_table.free()
+    assert allocator.get_num_free_blocks(device=Device.GPU) == num_gpu_blocks
+
+
+@pytest.mark.parametrize("block_size", [8])
+@pytest.mark.parametrize("sequence_len", [1, 16, 129])
+@pytest.mark.parametrize("append_len", [1, 16, 129])
+@pytest.mark.parametrize("appender", ["forked", "original"])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+def test_cow(block_size: int, sequence_len: int, append_len: int,
+             allocator_type: str, appender: str):
+    """Fork a sequence; append to the forked sequence; verify there's a CoW.
+    """
+    num_gpu_blocks = 1024
+
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=0,
+        block_size=block_size,
+    )
+
+    token_ids = list(range(sequence_len))
+    token_ids_to_append = list(range(append_len))
+
+    original_block_table = BlockTable(
+        block_size=block_size,
+        block_allocator=allocator,
+    )
+
+    num_expected_non_cow_blocks = cdiv(sequence_len, block_size)
+    num_expected_cow_blocks = cdiv(sequence_len + append_len,
+                                   block_size) - (sequence_len // block_size)
+
+    original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
+    original_block_ids = original_block_table.physical_block_ids
+
+    forked_block_table = original_block_table.fork()
+
+    # Expect no additional allocation (copy on _write_).
+    assert allocator.get_num_free_blocks(
+        Device.GPU) == (num_gpu_blocks - num_expected_non_cow_blocks)
+
+    if appender == "forked":
+        appender_block_table = forked_block_table
+        static_block_table = original_block_table
+    elif appender == "original":
+        appender_block_table = original_block_table
+        static_block_table = forked_block_table
+    else:
+        raise ValueError(f"unknown test config {appender=}")
+
+    # Write tokens.
+    appender_block_table.append_token_ids(token_ids_to_append)
+
+    # Expect the non-appending block table to have no change.
+    assert static_block_table.physical_block_ids == original_block_ids
+    assert appender_block_table.physical_block_ids != original_block_ids
+
+    # Expect the blocks changed during append to have a CoW.
+    assert allocator.get_num_free_blocks(
+        Device.GPU) == num_gpu_blocks - (num_expected_non_cow_blocks +
+                                         num_expected_cow_blocks)
+
+    cows = allocator.clear_copy_on_writes()
+    if sequence_len % block_size > 0:
+        # If the last block in the sequence is not full, then when appending we
+        # expect a CoW.
+        assert cows
+
+        cow_block_id = sequence_len // block_size
+        expected_src = static_block_table.physical_block_ids[cow_block_id]
+        expected_dst = appender_block_table.physical_block_ids[cow_block_id]
+
+        assert expected_src in cows
+        assert expected_dst in cows[expected_src]
+    else:
+        # Otherwise, there should be no copy-on-write.
+        assert not cows
+
+    static_block_table.free()
+    appender_block_table.free()
+
+    # After free, expect all blocks to be freed.
+    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
+
+
+@pytest.mark.parametrize("block_size", [8])
+@pytest.mark.parametrize("sequence_len", [1, 16, 129])
+@pytest.mark.parametrize("append_len", [1, 16, 129])
+@pytest.mark.parametrize("lookahead_slots", [1, 16, 129])
+@pytest.mark.parametrize("appender", ["forked", "original"])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+def test_cow_lookahead_simple(block_size: int, sequence_len: int,
+                              append_len: int, lookahead_slots: int,
+                              allocator_type: str, appender: str):
+    """Similar to test_cow, except with lookahead allocation. The assertions are
+    less rigorous due to the complexity of the property under test.
+    """
+    num_gpu_blocks = 1024
+
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=0,
+        block_size=block_size,
+    )
+
+    token_ids = list(range(sequence_len))
+    token_ids_to_append = list(range(append_len))
+
+    original_block_table = BlockTable(
+        block_size=block_size,
+        block_allocator=allocator,
+    )
+
+    original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
+
+    # Allocate lookahead slots.
+    original_block_table.ensure_num_empty_slots(lookahead_slots)
+    original_block_ids = original_block_table.physical_block_ids
+
+    forked_block_table = original_block_table.fork()
+
+    if appender == "forked":
+        appender_block_table = forked_block_table
+        static_block_table = original_block_table
+    elif appender == "original":
+        appender_block_table = original_block_table
+        static_block_table = forked_block_table
+    else:
+        raise ValueError(f"unknown test config {appender=}")
+
+    # Write tokens.
+    appender_block_table.append_token_ids(token_ids_to_append)
+
+    # Expect the non-appending block table to have no change.
+    assert static_block_table.physical_block_ids == original_block_ids
+    assert appender_block_table.physical_block_ids != original_block_ids
+
+    cows = allocator.clear_copy_on_writes()
+
+    # Always expect copy-on-write
+    assert cows
+
+    if sequence_len % block_size > 0:
+        # If the last block in the sequence is not full, then when appending we
+        # expect a CoW.
+        assert cows
+
+        cow_block_id = sequence_len // block_size
+        expected_src = static_block_table.physical_block_ids[cow_block_id]
+        expected_dst = appender_block_table.physical_block_ids[cow_block_id]
+
+        assert expected_src in cows
+        assert expected_dst in cows[expected_src]
+
+    static_block_table.free()
+    appender_block_table.free()
+
+    # After free, expect all blocks to be freed.
+    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
+
+
+@pytest.mark.parametrize("block_size", [1, 8])
+@pytest.mark.parametrize("sequence_len", [1, 16, 129])
+@pytest.mark.parametrize("num_new_tokens", [1, 16, 129])
+@pytest.mark.parametrize("num_lookahead_slots", [1, 7, 8])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int,
+                                            num_new_tokens: int,
+                                            num_lookahead_slots: int,
+                                            allocator_type: str):
+    """Verify correct calculation of get_num_blocks_touched_by_append_slots.
+
+    This is done by using copy-on-write, which requires any modified block to
+    be copied before write if the refcount > 1. We set the refcount>1 by forking
+    a sequence, then measure the free blocks before and after an append. If the
+    number of consumed blocks equals what `get_num_blocks_touched_by_append_
+    slots` returns, then the calculation is correct.
+    """
+
+    num_gpu_blocks = 1024
+
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=0,
+        block_size=block_size,
+    )
+
+    token_ids = list(range(sequence_len))
+    token_ids_to_append = list(range(num_new_tokens))
+
+    block_table = BlockTable(
+        block_size=block_size,
+        block_allocator=allocator,
+    )
+
+    block_table.allocate(token_ids=token_ids, device=Device.GPU)
+
+    # Add lookahead before fork so both sequences have the same lookahead
+    # blocks.
+    block_table.ensure_num_empty_slots(num_empty_slots=num_lookahead_slots)
+
+    # Fork sequence so that every block has refcount > 1.
+    _ = block_table.fork()
+
+    # Determine how many blocks should be touched.
+    expected_num_touched_blocks = (
+        block_table.get_num_blocks_touched_by_append_slots(
+            token_ids=token_ids_to_append,
+            num_lookahead_slots=num_lookahead_slots))
+
+    # Measure how many blocks are touched by measuring num_free_blocks before
+    # and after the append.
+    #
+    # We expect append_token_ids to CoW all mutated blocks that have refcount>1.
+    num_free_blocks_before_append = allocator.get_num_free_blocks(Device.GPU)
+    block_table.append_token_ids(token_ids_to_append, num_lookahead_slots)
+    num_consumed_blocks = (num_free_blocks_before_append -
+                           allocator.get_num_free_blocks(Device.GPU))
+
+    # TODO(cade) ensure equality when num_lookahead_slots > 0.
+    # The reason we have < is because lookahead blocks are not copied eagerly;
+    # they are copied on first write. This will cause issues for beam search +
+    # speculative decoding. This is acceptable for now as it is a large effort
+    # to combine the two. To fix this, we can ensure single sequence ownership
+    # of lookahead blocks by appending empty slots to each block, which will
+    # trigger the CoW.
+    #
+    # Until then, we can accept that the consumed tokens are <= the expected
+    # tokens when appending with lookahead.
+    if num_lookahead_slots > 0:
+        assert num_consumed_blocks <= expected_num_touched_blocks
+    else:
+        assert num_consumed_blocks == expected_num_touched_blocks
--- a/tests/core/block/test_common.py
+++ b/tests/core/block/test_common.py
@@ -0,0 +1,42 @@
+import random
+
+import pytest
+
+from vllm.core.block.common import RefCounter
+
+
+@pytest.mark.parametrize("seed", list(range(20)))
+@pytest.mark.parametrize("num_incrs", [1, 100])
+@pytest.mark.parametrize("num_blocks", [1024])
+def test_incr(seed: int, num_incrs: int, num_blocks: int):
+    random.seed(seed)
+
+    all_block_indices = list(range(num_blocks))
+    counter = RefCounter(all_block_indices=all_block_indices)
+
+    block_id = random.randint(0, num_blocks - 1)
+    for i in range(num_incrs):
+        value = counter.incr(block_id)
+        assert value == i + 1
+
+
+@pytest.mark.parametrize("seed", list(range(20)))
+@pytest.mark.parametrize("num_incrs", [1, 100])
+@pytest.mark.parametrize("num_blocks", [1024])
+def test_incr_decr(seed: int, num_incrs: int, num_blocks: int):
+    random.seed(seed)
+
+    all_block_indices = list(range(num_blocks))
+    counter = RefCounter(all_block_indices=all_block_indices)
+
+    block_id = random.randint(0, num_blocks - 1)
+    for i in range(num_incrs):
+        value = counter.incr(block_id)
+        assert value == i + 1
+
+    for i in range(num_incrs):
+        value = counter.decr(block_id)
+        assert value == num_incrs - (i + 1)
+
+    with pytest.raises(AssertionError):
+        counter.decr(block_id)
--- a/tests/core/block/test_cpu_gpu_block_allocator.py
+++ b/tests/core/block/test_cpu_gpu_block_allocator.py
@@ -0,0 +1,93 @@
+import pytest
+
+from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
+from vllm.utils import Device, chunk_list
+
+
+@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
+@pytest.mark.parametrize("num_gpu_blocks", [1024])
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
+                          block_size: int, allocator_type: str):
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=num_cpu_blocks,
+        block_size=block_size,
+    )
+
+    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
+    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
+
+    cpu_blocks = [
+        allocator.allocate_mutable(prev_block=None, device=Device.CPU)
+        for _ in range(num_cpu_blocks)
+    ]
+    assert allocator.get_num_free_blocks(Device.CPU) == 0
+    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
+
+    gpu_blocks = [
+        allocator.allocate_mutable(prev_block=None, device=Device.GPU)
+        for _ in range(num_gpu_blocks)
+    ]
+    assert allocator.get_num_free_blocks(Device.CPU) == 0
+    assert allocator.get_num_free_blocks(Device.GPU) == 0
+
+    _ = [allocator.free(block) for block in cpu_blocks]
+    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
+    assert allocator.get_num_free_blocks(Device.GPU) == 0
+
+    _ = [allocator.free(block) for block in gpu_blocks]
+    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
+    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
+
+
+@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
+@pytest.mark.parametrize("num_gpu_blocks", [1024])
+@pytest.mark.parametrize("block_size", [2])
+@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
+def test_allocate_immutable(num_cpu_blocks: int, num_gpu_blocks: int,
+                            block_size: int, allocator_type: str):
+    allocator = CpuGpuBlockAllocator.create(
+        allocator_type=allocator_type,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=num_cpu_blocks,
+        block_size=block_size,
+    )
+
+    unique_token_ids = list(
+        range((num_cpu_blocks + num_gpu_blocks) * block_size))
+    gpu_token_ids = chunk_list(unique_token_ids[:num_gpu_blocks * block_size],
+                               block_size)
+    cpu_token_ids = chunk_list(unique_token_ids[num_gpu_blocks * block_size:],
+                               block_size)
+
+    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
+    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
+
+    cpu_blocks = [
+        allocator.allocate_immutable(prev_block=None,
+                                     token_ids=token_ids,
+                                     device=Device.CPU)
+        for token_ids in cpu_token_ids
+    ]
+    assert allocator.get_num_free_blocks(Device.CPU) == 0
+    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
+
+    gpu_blocks = [
+        allocator.allocate_immutable(prev_block=None,
+                                     token_ids=token_ids,
+                                     device=Device.GPU)
+        for token_ids in gpu_token_ids
+    ]
+    assert allocator.get_num_free_blocks(Device.CPU) == 0
+    assert allocator.get_num_free_blocks(Device.GPU) == 0
+
+    _ = [allocator.free(block) for block in cpu_blocks]
+    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
+    assert allocator.get_num_free_blocks(Device.GPU) == 0
+
+    _ = [allocator.free(block) for block in gpu_blocks]
+    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
+    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
--- a/tests/core/block/test_naive_block.py
+++ b/tests/core/block/test_naive_block.py
@@ -0,0 +1,102 @@
+from typing import List, Optional
+
+import pytest
+
+from vllm.core.block.interfaces import Block, BlockAllocator
+from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
+
+
+class TestNaiveBlockAllocator:
+
+    @staticmethod
+    def create_allocate_lambda(allocate_type: str,
+                               allocator: NaiveBlockAllocator,
+                               prev_block: Optional[Block],
+                               token_ids: List[int]):
+        if allocate_type == "immutable":
+            allocate_block = lambda: allocator.allocate_immutable(
+                prev_block=prev_block, token_ids=token_ids)
+        elif allocate_type == "mutable":
+            allocate_block = lambda: allocator.allocate_mutable(prev_block=
+                                                                prev_block)
+        else:
+            raise ValueError()
+
+        return allocate_block
+
+    @staticmethod
+    @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
+    @pytest.mark.parametrize("num_blocks", [1, 1024])
+    @pytest.mark.parametrize("block_size", [1, 16])
+    def test_allocate_ooms(allocate_type: str, num_blocks: int,
+                           block_size: int):
+        allocator = NaiveBlockAllocator(create_block=NaiveBlock,
+                                        num_blocks=num_blocks,
+                                        block_size=block_size)
+        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
+            allocate_type,
+            allocator,
+            prev_block=None,
+            token_ids=list(range(block_size)))
+
+        [allocate_block() for _ in range(num_blocks)]
+        with pytest.raises(BlockAllocator.NoFreeBlocksError):
+            allocate_block()
+
+    @staticmethod
+    @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
+    @pytest.mark.parametrize("num_blocks", [1, 1024])
+    @pytest.mark.parametrize("block_size", [1, 16])
+    def test_free_prevents_oom(allocate_type: str, num_blocks: int,
+                               block_size: int):
+        allocator = NaiveBlockAllocator(create_block=NaiveBlock,
+                                        num_blocks=num_blocks,
+                                        block_size=block_size)
+        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
+            allocate_type,
+            allocator,
+            prev_block=None,
+            token_ids=list(range(block_size)))
+
+        blocks = [allocate_block() for _ in range(num_blocks)]
+
+        with pytest.raises(BlockAllocator.NoFreeBlocksError):
+            allocate_block()
+
+        block_to_free = blocks.pop()
+
+        for _ in range(100):
+            block_id = block_to_free.block_id
+            allocator.free(block_to_free)
+            assert block_to_free.block_id is None
+
+            new_block = allocate_block()
+            assert new_block.block_id == block_id
+
+            with pytest.raises(BlockAllocator.NoFreeBlocksError):
+                allocate_block()
+
+            block_to_free = new_block
+
+    @staticmethod
+    @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
+    @pytest.mark.parametrize("num_blocks", [1024])
+    @pytest.mark.parametrize("block_size", [16])
+    def test_get_num_free_blocks(allocate_type: str, num_blocks: int,
+                                 block_size: int):
+        allocator = NaiveBlockAllocator(create_block=NaiveBlock,
+                                        num_blocks=num_blocks,
+                                        block_size=block_size)
+        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
+            allocate_type,
+            allocator,
+            prev_block=None,
+            token_ids=list(range(block_size)))
+
+        assert allocator.get_num_free_blocks() == num_blocks
+
+        blocks = [allocate_block() for _ in range(num_blocks)]
+
+        for i, block in enumerate(blocks):
+            assert allocator.get_num_free_blocks() == i
+            allocator.free(block)
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -0,0 +1,509 @@
+import math
+import random
+from typing import List, Optional
+from unittest.mock import MagicMock
+
+import pytest
+
+from vllm.core.block.interfaces import Block, BlockAllocator
+from vllm.core.block.prefix_caching_block import (PrefixCachingBlock,
+                                                  PrefixCachingBlockAllocator)
+
+
+class TestPrefixCachingBlock:
+
+    @staticmethod
+    @pytest.mark.parametrize("seed", list(range(10)))
+    @pytest.mark.parametrize("block_size", [1, 16])
+    @pytest.mark.parametrize("is_curr_block_full", [True, False])
+    def test_first_block_has_correct_content_hash(seed: int, block_size: int,
+                                                  is_curr_block_full: bool):
+        """Verify a block which is first in the sequence has the correct hash.
+        """
+        random.seed(seed)
+        num_to_fill = block_size if is_curr_block_full else random.randint(
+            0, block_size - 1)
+        token_ids = list(range(num_to_fill))
+        mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
+
+        block_with_prev = PrefixCachingBlock(
+            prev_block=None,
+            token_ids=token_ids,
+            block_size=block_size,
+            prefix_caching_allocator=mock_allocator)
+
+        if is_curr_block_full:
+            # Expect hash since block is full.
+            assert block_with_prev.content_hash == (
+                PrefixCachingBlock.hash_block_tokens(
+                    is_first_block=True,
+                    prev_block_hash=None,
+                    cur_block_token_ids=token_ids))
+        else:
+            # Do not expect hash since block is not full.
+            assert block_with_prev.content_hash is None
+
+    @staticmethod
+    @pytest.mark.parametrize("seed", list(range(10)))
+    @pytest.mark.parametrize("block_size", [1, 16])
+    @pytest.mark.parametrize("is_curr_block_full", [True, False])
+    @pytest.mark.parametrize("prev_block_has_hash", [True, False])
+    def test_nth_block_has_correct_content_hash(seed: int, block_size: int,
+                                                is_curr_block_full: bool,
+                                                prev_block_has_hash: bool):
+        """Verify a block which is not first in the sequence has the correct
+        hash.
+        """
+
+        random.seed(seed)
+
+        previous_block = MagicMock(spec=PrefixCachingBlock)
+        prev_block_hash = random.randint(0, 1000)
+        previous_block.content_hash = (prev_block_hash
+                                       if prev_block_has_hash else None)
+
+        num_to_fill = block_size if is_curr_block_full else random.randint(
+            0, block_size - 1)
+        token_ids = list(range(num_to_fill))
+        mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
+
+        block_with_prev = PrefixCachingBlock(
+            prev_block=previous_block,
+            token_ids=token_ids,
+            block_size=block_size,
+            prefix_caching_allocator=mock_allocator,
+        )
+
+        if is_curr_block_full and prev_block_has_hash:
+            # Expect hash since block is full and previous block has hash.
+            assert (block_with_prev.content_hash ==
+                    PrefixCachingBlock.hash_block_tokens(
+                        is_first_block=False,
+                        prev_block_hash=prev_block_hash,
+                        cur_block_token_ids=token_ids))
+        else:
+            # Do not expect hash since block is not full or the previous block
+            # does not have a hash.
+            assert block_with_prev.content_hash is None
+
+    @staticmethod
+    @pytest.mark.parametrize("block_size", [1, 2, 16])
+    @pytest.mark.parametrize("num_tokens", list(range(3)))
+    @pytest.mark.parametrize("num_empty_trailing_blocks", [0, 1, 10])
+    def test_blocks_have_correct_hash_in_chain(block_size: int,
+                                               num_tokens: int,
+                                               num_empty_trailing_blocks: int):
+        """Create two chains of logical blocks with the same contents.
+        Assert the hashes are equal.
+        """
+        random.seed(0)
+
+        token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]
+
+        first_chain, second_chain = [
+            TestPrefixCachingBlock.create_chain(
+                block_size=block_size,
+                token_ids=token_ids,
+                num_empty_trailing_blocks=num_empty_trailing_blocks)
+            for _ in range(2)
+        ]
+
+        for first_chain_block, second_chain_block in zip(
+                first_chain, second_chain):
+            assert (first_chain_block.content_hash ==
+                    second_chain_block.content_hash)
+
+        if not first_chain or not second_chain:
+            assert first_chain == second_chain
+            assert num_tokens == 0
+
+    @staticmethod
+    def create_chain(block_size: int,
+                     token_ids: List[int],
+                     num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]:
+        """Helper method which creates a chain of blocks.
+        """
+        blocks = []
+        num_blocks = math.ceil(
+            len(token_ids) / block_size) + num_empty_trailing_blocks
+
+        if num_blocks == 0:
+            return []
+
+        allocator = MagicMock(spec=PrefixCachingBlockAllocator)
+
+        prev_block = None
+        for block_number in range(0, num_blocks):
+            prev_block = PrefixCachingBlock(
+                prev_block=prev_block,
+                token_ids=[],
+                block_size=block_size,
+                prefix_caching_allocator=allocator,
+            )
+
+            tokens_to_append = token_ids[block_number *
+                                         block_size:(block_number + 1) *
+                                         block_size]
+            if tokens_to_append:
+                prev_block.append_token_ids(tokens_to_append)
+
+            blocks.append(prev_block)
+
+        return blocks
+
+
+class TestPrefixCachingBlockAllocator:
+
+    @staticmethod
+    def create_allocate_lambda(allocate_type: str, allocator: BlockAllocator,
+                               prev_block: Optional[Block],
+                               token_ids: List[int]):
+        if allocate_type == "immutable":
+            allocate_block = lambda: allocator.allocate_immutable(
+                prev_block=prev_block, token_ids=token_ids)
+        elif allocate_type == "mutable":
+            allocate_block = lambda: allocator.allocate_mutable(prev_block=
+                                                                prev_block)
+        else:
+            raise ValueError()
+
+        return allocate_block
+
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [1, 1024])
+    @pytest.mark.parametrize("block_size", [1, 16])
+    def test_allocate_mutable_ooms(num_blocks: int, block_size: int):
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+        allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda(
+            allocate_type="mutable",
+            allocator=allocator,
+            prev_block=None,
+            token_ids=list(range(block_size)),
+        )
+
+        [allocate_block() for _ in range(num_blocks)]
+        with pytest.raises(BlockAllocator.NoFreeBlocksError):
+            allocate_block()
+
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [1, 1024])
+    @pytest.mark.parametrize("block_size", [1, 16])
+    def test_allocate_immutable_does_not_oom_single_hash(
+            num_blocks: int, block_size: int):
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+        allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda(
+            allocate_type="immutable",
+            allocator=allocator,
+            prev_block=None,
+            token_ids=list(range(block_size)),
+        )
+
+        blocks = [allocate_block() for _ in range(num_blocks)]
+
+        # Expect no OOM. If these were mutable blocks, this would OOM.
+        non_oom_block = allocate_block()
+
+        # Expect all blocks to have same physical block index.
+        for block in blocks:
+            assert (block.block_id == non_oom_block.block_id)
+
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [1, 1024])
+    @pytest.mark.parametrize("block_size", [1, 16])
+    def test_allocate_immutable_ooms_many_hash(num_blocks: int,
+                                               block_size: int):
+        """Consume all blocks using many different hashes/block content.
+
+        Do this by creating a sequence that is very long.
+        Expect next block to OOM.
+        """
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+
+        # Create token ids that will exhaust all blocks.
+        token_ids = list(range(num_blocks * block_size))
+
+        chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+
+        # Expect allocation with unseen hash to fail.
+        with pytest.raises(BlockAllocator.NoFreeBlocksError):
+            allocator.allocate_immutable(prev_block=chain[-1],
+                                         token_ids=list(range(block_size)))
+
+        # Expect mutable allocation to fail.
+        with pytest.raises(BlockAllocator.NoFreeBlocksError):
+            allocator.allocate_mutable(prev_block=chain[-1])
+
+        # Expect allocation of exact same chain to pass.
+        second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+
+        # Expect physical block indices to be the same in both chains.
+        assert chain and second_chain
+        for first_chain_block, second_chain_block in zip(chain, second_chain):
+            assert (first_chain_block.block_id == second_chain_block.block_id)
+
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [1, 1024])
+    @pytest.mark.parametrize("block_size", [1, 16])
+    def test_free_prevents_oom(num_blocks: int, block_size: int):
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+
+        # Create token ids that will exhaust all blocks.
+        token_ids = list(range(num_blocks * block_size))
+
+        chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+
+        # Expect mutable allocation to fail.
+        with pytest.raises(BlockAllocator.NoFreeBlocksError):
+            allocator.allocate_mutable(prev_block=None)
+
+        block_to_free = chain[-1]
+
+        # Expect free/allocate loop to succeed many times.
+        for i in range(100):
+            block_id = block_to_free.block_id
+            allocator.free(block_to_free)
+            assert block_to_free.block_id is None, i
+
+            new_block = allocator.allocate_mutable(prev_block=None)
+            assert new_block.block_id == block_id, i
+
+            with pytest.raises(BlockAllocator.NoFreeBlocksError):
+                allocator.allocate_mutable(prev_block=None)
+
+            block_to_free = new_block
+
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [1024])
+    @pytest.mark.parametrize("block_size", [16])
+    @pytest.mark.parametrize("seed", list(range(20)))
+    def test_get_num_free_blocks(num_blocks: int, block_size: int, seed: int):
+        random.seed(seed)
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+        num_blocks_to_consume = random.randint(1, num_blocks - 1)
+
+        # Create token ids that will exhaust all blocks.
+        token_ids = list(range(num_blocks_to_consume * block_size))
+
+        chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+
+        # Free each block in chain, assert num free blocks includes new free
+        # block.
+        for i, block in enumerate(chain):
+            assert allocator.get_num_free_blocks() == (num_blocks -
+                                                       num_blocks_to_consume +
+                                                       i)
+            allocator.free(block)
+
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [1024])
+    @pytest.mark.parametrize("block_size", [16])
+    @pytest.mark.parametrize("seed", list(range(20)))
+    def test_get_num_free_blocks_shared(num_blocks: int, block_size: int,
+                                        seed: int):
+        """Verify sharing occurs by allocating two sequences that share prefixes
+        and incrementally freeing blocks.
+        """
+        random.seed(seed)
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+        num_blocks_to_consume = random.randint(1, num_blocks - 1)
+
+        # Create token ids that will exhaust all blocks.
+        token_ids = list(range(num_blocks_to_consume * block_size))
+
+        first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+        second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+
+        # Free each block in the first chain. Since all blocks are shared, the
+        # free count should stay constant.
+        for i, block in enumerate(first_chain):
+            assert allocator.get_num_free_blocks() == (num_blocks -
+                                                       num_blocks_to_consume)
+            allocator.free(block)
+
+        # Free each block in the second chain. Since the refcount is now zero,
+        # the free count should increment with each free.
+        for i, block in enumerate(second_chain):
+            assert allocator.get_num_free_blocks() == (num_blocks -
+                                                       num_blocks_to_consume +
+                                                       i)
+            allocator.free(block)
+
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [1024])
+    @pytest.mark.parametrize("block_size", [16])
+    @pytest.mark.parametrize("seed", list(range(20)))
+    def test_get_common_computed_block_ids(num_blocks: int, block_size: int,
+                                           seed: int):
+        """Verify get_common_computed_block_ids could get correct result
+        by create two immutable chain sharing prefix at specified pos,
+        and compare whether we also could get right result
+        from get_common_computed_block_ids.
+        """
+        random.seed(seed)
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks * 2,
+                                                block_size=block_size)
+        num_blocks_to_consume = random.randint(1, num_blocks - 1)
+
+        # Create token ids that will exhaust all blocks.
+        token_ids = list(range(num_blocks_to_consume * block_size))
+        blocks = list(range(num_blocks_to_consume))
+
+        first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+
+        # mark all blocks in first chain as computed
+        allocator.mark_blocks_as_computed(blocks)
+
+        # After zero_point, second_chain's token_ids would be set -1, which
+        # make it different from here comparing with first_chain
+        zero_point = random.randint(1, len(token_ids) - 1)
+        zero_point_blocks = zero_point // block_size
+        token_ids[zero_point:] = [-1] * (len(token_ids) - zero_point)
+
+        second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+
+        first_computed_ids = [
+            first_chain[i].block_id for i in range(num_blocks_to_consume)
+        ]
+        second_computed_ids = [
+            second_chain[i].block_id for i in range(num_blocks_to_consume)
+        ]
+        res = allocator.get_common_computed_block_ids(
+            [first_computed_ids, second_computed_ids])
+
+        assert (len(res) == zero_point_blocks)
+
+    # Test case where two last accessed times are equal
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [1024])
+    @pytest.mark.parametrize("block_size", [16])
+    @pytest.mark.parametrize("seed", list(range(20)))
+    def test_eviction_order(num_blocks: int, block_size: int, seed: int):
+        """This test case simulate the two chain created and free in order,
+        and together they would exhaust the initial freed blocks.
+
+        So the next block created after those two chain shall use the block
+        from the first chain as that block has long access time.
+        While first chain has two blocks, it shall pick up the last one, as
+        it has larger token number.
+        """
+
+        random.seed(seed)
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+        num_blocks_to_consume = num_blocks + 1
+
+        token_ids = list(range(num_blocks_to_consume * block_size))
+
+        num_blocks_in_first_chain = 2
+        num_tokens_in_first_chain = block_size * num_blocks_in_first_chain
+        # First chain takes the first block
+        first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids[:num_tokens_in_first_chain],
+            allocator=allocator,
+        )
+        # There should only be one block allocated at this point
+        assert allocator.get_num_free_blocks() == (num_blocks -
+                                                   num_blocks_in_first_chain)
+
+        # Set the last accessed time of the first block to 1
+        blocks_ids = [block.block_id for block in first_chain]
+        allocator.mark_blocks_as_accessed(blocks_ids, 1)
+
+        # Second chain takes the rest of the blocks
+        second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids[num_tokens_in_first_chain:-block_size],
+            allocator=allocator,
+        )
+
+        # There shouldn't be any blocks left at this point
+        assert allocator.get_num_free_blocks() == (0)
+
+        assert len(first_chain) == num_blocks_in_first_chain
+        last_block_id = first_chain[-1].block_id
+        # Free each block in the first chain.
+        for i, block in enumerate(first_chain):
+            allocator.free(block)
+
+        # Set the last accessed time on all of the blocks in the second chain
+        # to 2
+        blocks_ids = [block.block_id for block in second_chain]
+        allocator.mark_blocks_as_accessed(blocks_ids, 2)
+
+        # Free each block in the second chain.
+        for i, block in enumerate(second_chain):
+            allocator.free(block)
+
+        # Allocate a new block and check that it's the least recently used block
+        # from the first chain.
+        new_block = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids[-block_size:],
+            allocator=allocator,
+        )
+
+        assert new_block[0].block_id == last_block_id
+
+    @staticmethod
+    def create_immutable_chain(
+        block_size: int,
+        token_ids: List[int],
+        allocator: PrefixCachingBlockAllocator,
+    ) -> List[PrefixCachingBlock]:
+        """Helper method which creates a chain of blocks.
+        """
+        blocks = []
+        num_blocks = math.ceil(len(token_ids) / block_size)
+
+        if num_blocks == 0:
+            return []
+
+        prev_block = None
+        for block_number in range(0, num_blocks):
+            block_token_ids = token_ids[block_number *
+                                        block_size:(block_number + 1) *
+                                        block_size]
+            prev_block = allocator.allocate_immutable(
+                prev_block=prev_block, token_ids=block_token_ids)
+            blocks.append(prev_block)
+
+        return blocks
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@@ -0,0 +1,367 @@
+import time
+from typing import List
+
+import pytest
+
+from vllm import SamplingParams
+from vllm.block import PhysicalTokenBlock
+from vllm.core.block_manager_v1 import (BlockSpaceManagerV1,
+                                        UncachedBlockAllocator)
+from vllm.core.interfaces import AllocStatus
+from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus
+from vllm.utils import Device
+
+from .utils import create_dummy_prompt
+
+
+def test_block_allocator_allocate():
+    block_size = 4
+    num_cpu_blocks = 4
+    cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
+                                           num_cpu_blocks)
+
+    # Allocate all available cpu blocks.
+    num_free = num_cpu_blocks
+    assert cpu_allocator.get_num_free_blocks() == num_free
+    for _ in range(num_cpu_blocks):
+        block = cpu_allocator.allocate()
+        num_free -= 1
+
+        assert block not in cpu_allocator.free_blocks
+        assert cpu_allocator.get_num_free_blocks() == num_free
+
+    with pytest.raises(ValueError):
+        cpu_allocator.allocate()
+
+
+def test_block_allocator_free():
+    block_size = 4
+    num_cpu_blocks = 4
+    cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
+                                           num_cpu_blocks)
+
+    # Allocate all available cpu blocks.
+    blocks: List[PhysicalTokenBlock] = []
+    for _ in range(num_cpu_blocks):
+        block = cpu_allocator.allocate()
+        blocks.append(block)
+        assert block not in cpu_allocator.free_blocks
+
+    # Free all allocated cpu blocks.
+    num_free = 0
+    assert cpu_allocator.get_num_free_blocks() == num_free
+    for block in blocks:
+        cpu_allocator.free(block)
+        num_free += 1
+        assert block in cpu_allocator.free_blocks
+        assert cpu_allocator.get_num_free_blocks() == num_free
+
+        with pytest.raises(ValueError):
+            cpu_allocator.free(block)
+
+
+def test_allocate():
+    block_size = 4
+    num_cpu_blocks = 4
+    num_gpu_blocks = 4
+    block_manager = BlockSpaceManagerV1(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=0)
+
+    # Allocate same sequence group to all available gpu blocks.
+    for i in range(num_gpu_blocks):
+        _, seq_group = create_dummy_prompt(str(i), block_size)
+        assert block_manager.can_allocate(seq_group)
+        block_manager.allocate(seq_group)
+    assert block_manager.can_allocate(seq_group) != AllocStatus.OK
+
+    # Allocate same sequence group to all available gpu blocks.
+    # Use watermark to reserve one gpu block.
+    block_manager = BlockSpaceManagerV1(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=1 / num_gpu_blocks)
+    for i in range(num_gpu_blocks - 1):
+        _, seq_group = create_dummy_prompt(str(i), block_size)
+        assert block_manager.can_allocate(seq_group)
+        block_manager.allocate(seq_group)
+    assert block_manager.can_allocate(seq_group) != AllocStatus.OK
+
+
+def test_append_slot_single_seq():
+    block_size = 4
+    num_cpu_blocks = 4
+    num_gpu_blocks = 4
+    block_manager = BlockSpaceManagerV1(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=0)
+
+    # Allocate single seq to gpu block.
+    prompt, seq_group = create_dummy_prompt("1", block_size)
+    block_manager.allocate(seq_group)
+
+    # Nothing to append. Sequence has no new logical blocks.
+    assert block_manager.can_append_slots(seq_group)
+    before_blocks = block_manager.get_num_free_gpu_blocks()
+    assert not block_manager.append_slots(prompt)
+    after_blocks = block_manager.get_num_free_gpu_blocks()
+    assert before_blocks == after_blocks
+
+    # Add block_size number of new tokens and append slot.
+    for i in range(block_size):
+        token_id = i + 5
+        prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
+
+    assert block_manager.can_append_slots(seq_group)
+    before_blocks = block_manager.get_num_free_gpu_blocks()
+    assert not block_manager.append_slots(prompt)
+    after_blocks = block_manager.get_num_free_gpu_blocks()
+    assert before_blocks - after_blocks == 1
+
+
+def test_append_slot_cow():
+    block_size = 4
+    num_cpu_blocks = 4
+    num_gpu_blocks = 4
+    block_manager = BlockSpaceManagerV1(block_size=block_size,
+                                        num_cpu_blocks=num_cpu_blocks,
+                                        num_gpu_blocks=num_gpu_blocks,
+                                        watermark=0)
+
+    # Allocate prompt to gpu block. There is one slot left in the block.
+    prompt = Sequence(seq_id=1,
+                      prompt="one two three",
+                      prompt_token_ids=[1, 2, 3],
+                      block_size=block_size)
+
+    # Fork the sequence, such that a COW will be required when we append a new
+    # token id.
+    child = prompt.fork(new_seq_id=2)
+
+    # Allocate space for the sequence group.
+    seq_group = SequenceGroup("1", [prompt, child], SamplingParams(),
+                              time.time(), time.perf_counter)
+    block_manager.allocate(seq_group)
+
+    # Fork and append a new token id. We expect a COW to be scheduled.
+    token_id = 4
+    child.append_token_id(token_id, {token_id: Logprob(0.0)})
+    block_manager.fork(prompt, child)
+
+    assert block_manager.can_append_slots(seq_group)
+    before_blocks = block_manager.get_num_free_gpu_blocks()
+
+    cows = block_manager.append_slots(child)
+    assert cows
+    for src_block, dst_blocks in cows.items():
+        assert src_block not in dst_blocks
+
+    after_blocks = block_manager.get_num_free_gpu_blocks()
+    assert before_blocks - after_blocks == 1
+
+
+def test_fork():
+    block_size = 4
+    num_cpu_blocks = 4
+    num_gpu_blocks = 4
+    block_manager = BlockSpaceManagerV1(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=0)
+
+    prompt, seq_group = create_dummy_prompt("1",
+                                            block_size - 1,
+                                            block_size=block_size)
+    block_manager.allocate(seq_group)
+
+    # Fork prompt and copy block tables.
+    child = prompt.fork(2)
+    block_manager.fork(prompt, child)
+    assert block_manager.get_block_table(
+        prompt) == block_manager.get_block_table(child)
+    token_id = 4
+    # Append token to child. Block is shared so copy on write occurs.
+    child.append_token_id(token_id, {token_id: Logprob(0.0)})
+    block_manager.append_slots(child)
+    assert block_manager.get_block_table(
+        prompt) != block_manager.get_block_table(child)
+
+
+def test_swap():
+    block_size = 4
+    num_cpu_blocks = 4
+    num_gpu_blocks = 4
+    block_manager = BlockSpaceManagerV1(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=0)
+
+    prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
+    prompt.status = SequenceStatus.WAITING
+    block_manager.allocate(seq_group)
+
+    # Emulate a forward pass by appending a single token.
+    # The block manager then knows how many unprocessed
+    # tokens will be written in the next forward pass.
+    token_id = 0
+    prompt.status = SequenceStatus.RUNNING
+    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
+
+    # Swap seq group from GPU -> CPU.
+    gpu_blocks = block_manager.get_block_table(prompt)
+    assert block_manager.can_swap_out(seq_group)
+    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    mapping = block_manager.swap_out(seq_group)
+    assert list(mapping.keys()) == gpu_blocks
+    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
+    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
+    prompt.status = SequenceStatus.SWAPPED
+
+    # Swap seq group from CPU -> GPU.
+    cpu_blocks = block_manager.get_block_table(prompt)
+    assert block_manager.can_swap_in(seq_group) == AllocStatus.OK
+    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    mapping = block_manager.swap_in(seq_group)
+    assert list(mapping.keys()) == cpu_blocks
+    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks
+    assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
+
+
+def test_free():
+    block_size = 4
+    num_cpu_blocks = 4
+    num_gpu_blocks = 4
+    block_manager = BlockSpaceManagerV1(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=0)
+
+    prompt, seq_group = create_dummy_prompt("1", block_size)
+    block_manager.allocate(seq_group)
+
+    # Free allocated seq.
+    prompt_blocks = len(block_manager.get_block_table(prompt))
+    before_blocks = block_manager.get_num_free_gpu_blocks()
+    block_manager.free(prompt)
+    after_blocks = block_manager.get_num_free_gpu_blocks()
+    assert after_blocks == before_blocks + prompt_blocks
+
+    # Block table for freed seq is deleted.
+    with pytest.raises(KeyError):
+        block_manager.get_block_table(prompt)
+
+
+def test_reset():
+    block_size = 4
+    num_cpu_blocks = 4
+    num_gpu_blocks = 4
+    block_manager = BlockSpaceManagerV1(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=0)
+
+    # Allocate same seq group on all available gpu blocks.
+    original_blocks = block_manager.get_num_free_gpu_blocks()
+    for i in range(num_gpu_blocks):
+        _, seq_group = create_dummy_prompt(str(i), block_size)
+        block_manager.allocate(seq_group)
+    assert block_manager.get_num_free_gpu_blocks() == 0
+
+    # Resetting block manager frees all allocated blocks.
+    block_manager.reset()
+    assert block_manager.get_num_free_gpu_blocks() == original_blocks
+
+
+def test_sliding_window_multi_seq():
+    """
+    Tests that memory allocation and deallocation is handled
+    correctly with multiple sequences that exceed the sliding
+    window's capacity.
+    """
+    block_size = 1
+    num_cpu_blocks = 8
+    num_gpu_blocks = 8
+    sliding_window = 2
+    block_manager = BlockSpaceManagerV1(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        sliding_window=sliding_window,
+                                        watermark=0)
+
+    assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
+
+    parent = Sequence(1, "one two three", [0, 1, 2], block_size)
+    seq_group = SequenceGroup("1", [parent], SamplingParams(), time.time(),
+                              None)
+    block_manager.allocate(seq_group)
+
+    # assert the number of blocks allocated is correct
+    # the parent seq has len 3, but since sliding_window is 2,
+    # we will use at most 2 blocks
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window
+
+    # Fork prompt and copy block tables.
+    child = parent.fork(2)
+    block_manager.fork(parent, child)
+
+    # assert the number of blocks allocated is correct
+    # forking does not increase memory consumption
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window
+
+    # assert both parent and child share all blocks
+    assert block_manager.get_block_table(
+        parent) == block_manager.get_block_table(child)
+
+    token_id = 4
+    # Append token to child. Block is shared so copy on write occurs.
+    child.append_token_id(token_id, {token_id: Logprob(0.0)})
+    block_manager.append_slots(child)
+
+    # assert the number of blocks allocated is correct
+    # we will use now one block more. Each seq will use 2 blocks,
+    # but only one can be shared
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window - 1
+
+    token_id = 5
+    parent.append_token_id(token_id, {token_id: Logprob(0.0)})
+    block_manager.append_slots(parent)
+
+    # assert the number of blocks allocated is correct
+    # no change, because both sequences are still just sharing one block
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window - 1
+
+    block_table_parent = block_manager.get_block_table(parent)
+    block_table_child = block_manager.get_block_table(child)
+
+    assert block_table_parent != block_table_child
+
+    # assert both blocks are sharing the second-last block
+    assert block_table_parent[-2] == block_table_child[-2]
+
+    # now let's clean up...
+    block_manager.free(parent)
+
+    # assert the number of blocks allocated is correct
+    # We have freed one seq, reducing the ref count of two blocks by one.
+    # One of the two was only used by the parent seq, so this is now free.
+    # The child seq still consumes sliding_window blocks
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window
+
+    # free all blocks
+    block_manager.free(child)
+
+    # assert all blocks are free now
+    assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -0,0 +1,564 @@
+from typing import List
+from unittest.mock import MagicMock
+
+import pytest  # noqa
+
+from vllm.config import CacheConfig, SchedulerConfig
+from vllm.core.interfaces import AllocStatus
+from vllm.core.scheduler import Scheduler
+from vllm.sequence import Logprob, SequenceGroup
+
+from .utils import create_dummy_prompt
+
+
+def get_sequence_groups(scheduler_output):
+    return [s.seq_group for s in scheduler_output.scheduled_seq_groups]
+
+
+def append_new_token(seq_group, token_id: int):
+    for seq in seq_group.get_seqs():
+        seq.append_token_id(token_id, {token_id: Logprob(token_id)})
+
+
+def schedule_and_update_computed_tokens(scheduler):
+    metas, out = scheduler.schedule()
+    for s, meta in zip(out.scheduled_seq_groups, metas):
+        s.seq_group.update_num_computed_tokens(meta.token_chunk_size)
+    return metas, out
+
+
+def test_simple():
+    """Verify basic scheduling works."""
+    block_size = 4
+    num_seq_group = 4
+    max_model_len = 16
+    max_num_batched_tokens = 64
+    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+                                       num_seq_group,
+                                       max_model_len,
+                                       enable_chunked_prefill=True)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    running: List[SequenceGroup] = []
+
+    # Add seq groups to scheduler.
+    for i in range(num_seq_group):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=block_size)
+        scheduler.add_seq_group(seq_group)
+        running.append(seq_group)
+
+    # Schedule seq groups prompts.
+    num_tokens = block_size * num_seq_group
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert set(get_sequence_groups(out)) == set(running)
+    assert out.num_batched_tokens == num_tokens
+    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
+            and not out.blocks_to_swap_out)
+    assert len(seq_group_meta) == num_seq_group
+    for s in running:
+        append_new_token(s, 1)
+
+    # Schedule seq groups generation.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert set(get_sequence_groups(out)) == set(running)
+    assert out.num_batched_tokens == num_seq_group
+    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
+            and not out.blocks_to_swap_out)
+    assert len(seq_group_meta) == num_seq_group
+
+
+def test_chunk():
+    """Verify prefills are chunked properly."""
+    block_size = 4
+    max_seqs = 60
+    max_model_len = 80
+    max_num_batched_tokens = 64
+    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+                                       max_seqs,
+                                       max_model_len,
+                                       enable_chunked_prefill=True)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    running: List[SequenceGroup] = []
+
+    # Add seq groups to scheduler.
+    for i in range(2):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        scheduler.add_seq_group(seq_group)
+        running.append(seq_group)
+
+    # Verify the second request is chunked.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert set(get_sequence_groups(out)) == set(running)
+    assert seq_group_meta[0].token_chunk_size == 60
+    # Verify it is chunked.
+    assert seq_group_meta[1].token_chunk_size == 4
+    assert out.num_prefill_groups == 2
+    assert out.num_batched_tokens == 64
+    # Only the first seq group has a new token appended.
+    append_new_token(running[0], 1)
+
+    # One chunked prefill, and one decoding.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert set(get_sequence_groups(out)) == set(running)
+    # The first one is prefill. Scheduler guarantees ordering.
+    assert seq_group_meta[0].token_chunk_size == 56
+    # The second one is a chunked prefill.
+    assert seq_group_meta[1].token_chunk_size == 1
+    assert out.num_prefill_groups == 1
+    assert out.num_batched_tokens == 57
+
+
+def test_complex():
+    block_size = 4
+    max_seqs = 60
+    max_model_len = 80
+    max_num_batched_tokens = 64
+    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+                                       max_seqs,
+                                       max_model_len,
+                                       enable_chunked_prefill=True)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    running: List[SequenceGroup] = []
+
+    # Add seq groups to scheduler.
+    for i in range(2):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        scheduler.add_seq_group(seq_group)
+        running.append(seq_group)
+        assert seq_group.is_prefill()
+
+    # Verify the second request is chunked.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+
+    assert set(get_sequence_groups(out)) == set(running)
+    assert seq_group_meta[0].token_chunk_size == 60
+    # Verify it is chunked.
+    assert seq_group_meta[1].token_chunk_size == 4
+    assert not running[0].is_prefill()
+    assert running[1].is_prefill()
+    assert out.num_prefill_groups == 2
+    assert out.num_batched_tokens == 64
+    # Only the first seq group has a new token appended.
+    append_new_token(running[0], 1)
+
+    # Add 2 more requsets.
+    for i in range(2, 4):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        scheduler.add_seq_group(seq_group)
+        running.append(seq_group)
+
+    # Decoding & chunked prefill & first chunk of 3rd request is scheduled.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(get_sequence_groups(out)) == 3
+    # The first one is the first chunked prefill.
+    assert seq_group_meta[0].token_chunk_size == 7
+    # The second one is the second new chunked prefill.
+    assert seq_group_meta[1].token_chunk_size == 56
+    # The last one is decode.
+    assert seq_group_meta[2].token_chunk_size == 1
+    # Two of them are in chunked prefill.
+    assert out.num_prefill_groups == 2
+    assert out.num_batched_tokens == 64
+    # The first 2 requests are now in decodine phase.
+    append_new_token(running[0], 1)
+    assert not running[0].is_prefill()
+    append_new_token(running[1], 1)
+    assert not running[1].is_prefill()
+    # The third request is still in prefill stage.
+    assert running[2].is_prefill()
+
+
+def test_maximal_decoding():
+    """Verify decoding requests are prioritized."""
+    block_size = 4
+    max_seqs = 2
+    max_model_len = 2
+    max_num_batched_tokens = 2
+    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+                                       max_seqs,
+                                       max_model_len,
+                                       enable_chunked_prefill=True)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    running: List[SequenceGroup] = []
+
+    # Add seq groups to scheduler.
+    for i in range(2):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=2)
+        scheduler.add_seq_group(seq_group)
+        running.append(seq_group)
+        assert seq_group.is_prefill()
+
+    # The first prefill is scheduled.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(get_sequence_groups(out)) == 1
+    assert seq_group_meta[0].token_chunk_size == 2
+    assert not running[0].is_prefill()
+    assert running[1].is_prefill()
+    assert out.num_prefill_groups == 1
+    assert out.num_batched_tokens == 2
+    # Only the first seq group has a new token appended.
+    append_new_token(running[0], 1)
+
+    # Create one more seq_group.
+    _, seq_group = create_dummy_prompt("3", prompt_length=2)
+    scheduler.add_seq_group(seq_group)
+    running.append(seq_group)
+    assert seq_group.is_prefill()
+    # The first decoding + second chunk is scheduled.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(get_sequence_groups(out)) == 2
+    assert seq_group_meta[0].token_chunk_size == 1
+    assert seq_group_meta[1].token_chunk_size == 1
+    assert not running[0].is_prefill()
+    assert running[1].is_prefill()
+    assert running[2].is_prefill()
+    assert out.num_prefill_groups == 1
+    assert out.num_batched_tokens == 2
+    append_new_token(running[0], 1)
+
+    # Decoding + running prefill is prioritized.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(get_sequence_groups(out)) == 2
+    assert seq_group_meta[0].token_chunk_size == 1
+    assert seq_group_meta[1].token_chunk_size == 1
+    assert not running[0].is_prefill()
+    assert not running[1].is_prefill()
+    assert out.num_prefill_groups == 1
+    assert out.num_batched_tokens == 2
+    append_new_token(running[0], 1)
+    append_new_token(running[1], 1)
+
+    # Only decoding is prioritized.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(get_sequence_groups(out)) == 2
+    assert seq_group_meta[0].token_chunk_size == 1
+    assert seq_group_meta[1].token_chunk_size == 1
+    assert not running[0].is_prefill()
+    assert not running[1].is_prefill()
+    assert out.num_prefill_groups == 0
+    assert out.num_batched_tokens == 2
+    append_new_token(running[0], 1)
+    append_new_token(running[1], 1)
+
+    # After aborting the decoding request, the fcfs new prefill is prioritized.
+    scheduler.abort_seq_group(running[0].request_id)
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(get_sequence_groups(out)) == 2
+    assert seq_group_meta[0].token_chunk_size == 1
+    assert seq_group_meta[1].token_chunk_size == 1
+    assert not running[1].is_prefill()
+    assert running[2].is_prefill()
+    assert out.num_prefill_groups == 1
+    assert out.num_batched_tokens == 2
+
+
+def test_prompt_limit():
+    """Verify max_num_batched_tokens < max_model_len is possible."""
+    block_size = 4
+    max_seqs = 32
+    max_model_len = 64
+    max_num_batched_tokens = 32
+    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+                                       max_seqs,
+                                       max_model_len,
+                                       enable_chunked_prefill=True)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    running: List[SequenceGroup] = []
+
+    _, seq_group = create_dummy_prompt("1", prompt_length=48)
+    scheduler.add_seq_group(seq_group)
+    running.append(seq_group)
+    assert seq_group.is_prefill()
+
+    # The prompt length > max_num_batched_tokens should be still scheduled.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(get_sequence_groups(out)) == 1
+    assert seq_group_meta[0].token_chunk_size == 32
+    assert running[0].is_prefill()
+    assert out.num_prefill_groups == 1
+    assert out.num_batched_tokens == 32
+
+
+def test_prompt_limit_exceed():
+    block_size = 4
+    max_seqs = 64
+    max_model_len = 32
+    max_num_batched_tokens = 64
+    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+                                       max_seqs,
+                                       max_model_len,
+                                       enable_chunked_prefill=True)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    running: List[SequenceGroup] = []
+
+    _, seq_group = create_dummy_prompt("2", prompt_length=48)
+    scheduler.add_seq_group(seq_group)
+    running.append(seq_group)
+    assert seq_group.is_prefill()
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(out.ignored_seq_groups) == 1
+    assert out.ignored_seq_groups[0] == seq_group
+
+
+def test_swap():
+    """Verify swapping works with chunked prefill requests"""
+    block_size = 4
+    max_seqs = 30
+    max_model_len = 200
+    max_num_batched_tokens = 30
+    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+                                       max_seqs,
+                                       max_model_len,
+                                       enable_chunked_prefill=True)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+
+    _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
+    scheduler.add_seq_group(seq_group)
+    _, out = schedule_and_update_computed_tokens(scheduler)
+    # The request is chunked.
+    # prefill scheduled now.
+    assert len(out.scheduled_seq_groups) == 1
+    assert out.num_prefill_groups == 1
+    assert seq_group.is_prefill()
+    assert out.num_batched_tokens == max_num_batched_tokens
+
+    # The last request should be swapped out.
+    scheduler.block_manager.can_append_slots = MagicMock()
+
+    def cannot_append_second_group(seq_group, num_lookahead_slots):
+        return seq_group.request_id != "1"
+
+    scheduler.block_manager.can_append_slots.side_effect = (
+        cannot_append_second_group)
+
+    # The running prefill is now swapped.
+    _, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(out.scheduled_seq_groups) == 0
+    assert out.num_batched_tokens == 0
+    assert out.blocks_to_swap_out != {}
+    assert out.blocks_to_swap_in == {}
+
+    # Add 1 more task. Swap should be prioritized over new prefill.
+    _, seq_group = create_dummy_prompt("2", prompt_length=60)
+    scheduler.add_seq_group(seq_group)
+    _, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(out.scheduled_seq_groups) == 1
+    # 3 decodes. It is swapped in.
+    assert out.num_batched_tokens == 30
+    assert out.blocks_to_swap_in != {}
+    assert out.blocks_to_swap_out == {}
+
+
+def test_running_prefill_prioritized_over_swap():
+    block_size = 4
+    max_seqs = 30
+    max_model_len = 200
+    max_num_batched_tokens = 30
+    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+                                       max_seqs,
+                                       max_model_len,
+                                       enable_chunked_prefill=True)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+
+    _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
+    scheduler.add_seq_group(seq_group)
+    _, out = schedule_and_update_computed_tokens(scheduler)
+    # The request is chunked.
+    # prefill scheduled now.
+    assert len(out.scheduled_seq_groups) == 1
+    assert out.num_prefill_groups == 1
+    assert seq_group.is_prefill()
+    assert out.num_batched_tokens == max_num_batched_tokens
+
+    # The request should be swapped out.
+    scheduler.block_manager.can_append_slots = MagicMock()
+
+    def cannot_append_second_group(seq_group, num_lookahead_slots):
+        return seq_group.request_id != "1"
+
+    scheduler.block_manager.can_append_slots.side_effect = (
+        cannot_append_second_group)
+
+    # The running prefill is now swapped.
+    _, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(out.scheduled_seq_groups) == 0
+    assert out.num_batched_tokens == 0
+    assert out.blocks_to_swap_out != {}
+    assert out.blocks_to_swap_in == {}
+
+    # Add 1 more task. Swap is not possible, so prefill is running.
+    scheduler.block_manager.can_swap_in = MagicMock()
+    scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER
+
+    _, seq_group2 = create_dummy_prompt("2", prompt_length=60)
+    scheduler.add_seq_group(seq_group2)
+    _, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(out.scheduled_seq_groups) == 1
+    # 3 decodes. It is swapped in.
+    assert out.num_batched_tokens == 30
+    assert out.blocks_to_swap_in == {}
+    assert out.blocks_to_swap_out == {}
+    assert out.scheduled_seq_groups[0].seq_group == seq_group2
+
+    # Now although swap is possible, running prefill is prioritized.
+    scheduler.block_manager.can_swap_in.return_value = AllocStatus.OK
+    _, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(out.scheduled_seq_groups) == 1
+    # 3 decodes. It is swapped in.
+    assert out.num_batched_tokens == 30
+    assert out.blocks_to_swap_in == {}
+    assert out.blocks_to_swap_out == {}
+    assert not seq_group2.is_prefill()
+    assert out.scheduled_seq_groups[0].seq_group == seq_group2
+    append_new_token(seq_group2, 1)
+
+    # Decoding is prioritized.
+    _, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(out.scheduled_seq_groups) == 1
+    # 3 decodes. It is swapped in.
+    assert out.num_batched_tokens == 1
+    assert out.blocks_to_swap_in == {}
+    assert out.blocks_to_swap_out == {}
+    assert not seq_group2.is_prefill()
+    assert out.scheduled_seq_groups[0].seq_group == seq_group2
+    append_new_token(seq_group2, 1)
+
+    # Since we abort the sequence group, we can finally swap.
+    scheduler.abort_seq_group(seq_group2.request_id)
+    _, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(out.scheduled_seq_groups) == 1
+    assert out.num_batched_tokens == 30
+    assert out.blocks_to_swap_in != {}
+    assert out.blocks_to_swap_out == {}
+
+
+def test_chunked_prefill_preempt():
+    """Verify preempt works with chunked prefill requests"""
+    block_size = 4
+    max_seqs = 30
+    max_model_len = 200
+    max_num_batched_tokens = 30
+    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+                                       max_seqs,
+                                       max_model_len,
+                                       enable_chunked_prefill=True)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+
+    _, seq_group = create_dummy_prompt("1", prompt_length=60)
+    scheduler.add_seq_group(seq_group)
+    _, out = schedule_and_update_computed_tokens(scheduler)
+    # The request is chunked.
+    # prefill scheduled now.
+    assert len(out.scheduled_seq_groups) == 1
+    assert out.num_prefill_groups == 1
+    assert seq_group.is_prefill()
+    assert out.num_batched_tokens == max_num_batched_tokens
+
+    # The request should be preempted.
+    scheduler.block_manager.can_append_slots = MagicMock()
+
+    def cannot_append_second_group(seq_group, num_lookahead_slots):
+        return seq_group.request_id != "1"
+
+    scheduler.block_manager.can_append_slots.side_effect = (
+        cannot_append_second_group)
+
+    # The running prefill is now preempted.
+    _, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(out.scheduled_seq_groups) == 0
+    assert out.num_batched_tokens == 0
+    assert out.blocks_to_swap_out == {}
+    assert out.blocks_to_swap_in == {}
+
+    # Make sure we can reschedule preempted request.
+    _, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(out.scheduled_seq_groups) == 1
+    assert out.num_prefill_groups == 1
+    assert seq_group.is_prefill()
+    assert out.num_batched_tokens == max_num_batched_tokens
+    assert seq_group.get_num_uncomputed_tokens() == 30
+
+    # We should be able to run prefill twice as it is chunked.
+    def cannot_append_second_group(seq_group, num_lookahead_slots):
+        return True
+
+    scheduler.block_manager.can_append_slots.side_effect = (
+        cannot_append_second_group)
+    _, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(out.scheduled_seq_groups) == 1
+    assert out.num_prefill_groups == 1
+    assert not seq_group.is_prefill()
+    assert out.num_batched_tokens == max_num_batched_tokens
+
+
+def test_chunked_prefill_max_seqs():
+    block_size = 4
+    max_seqs = 2
+    max_model_len = 80
+    max_num_batched_tokens = 64
+    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+                                       max_seqs,
+                                       max_model_len,
+                                       enable_chunked_prefill=True)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    running = []
+
+    _, seq_group = create_dummy_prompt("1", prompt_length=65)
+    scheduler.add_seq_group(seq_group)
+    running.append(seq_group)
+    # The first prefill is chunked.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert seq_group_meta[0].token_chunk_size == max_num_batched_tokens
+    assert len(get_sequence_groups(out)) == 1
+
+    # Add new requests.
+    for i in range(4):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=65)
+        scheduler.add_seq_group(seq_group)
+        running.append(seq_group)
+
+    # Make sure only 2 requests are scheduled.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert out.num_batched_tokens == max_num_batched_tokens
+    assert len(get_sequence_groups(out)) == 2
+    assert not running[0].is_prefill()
+    assert running[1].is_prefill()
+    append_new_token(running[0], 1)
+
+    # Although we have enough token budget, we can only schedule max_seqs.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert seq_group_meta[0].token_chunk_size == 2
+    assert seq_group_meta[1].token_chunk_size == 1
+    assert out.num_batched_tokens == 3
+    assert len(get_sequence_groups(out)) == max_seqs
+    assert not running[0].is_prefill()
+    assert not running[1].is_prefill()
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -0,0 +1,900 @@
+import time
+from collections import deque
+from typing import List
+from unittest.mock import MagicMock
+
+import pytest  # noqa
+
+from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
+from vllm.core.interfaces import AllocStatus
+from vllm.core.policy import PolicyFactory
+from vllm.core.scheduler import Scheduler, SchedulingBudget
+from vllm.lora.request import LoRARequest
+from vllm.sequence import Logprob, SequenceGroup, SequenceStatus
+
+from .utils import create_dummy_prompt
+
+
+def get_sequence_groups(scheduler_output):
+    return [s.seq_group for s in scheduler_output.scheduled_seq_groups]
+
+
+def append_new_token(out, token_id: int):
+    seq_groups = get_sequence_groups(out)
+    for seq_group in seq_groups:
+        for seq in seq_group.get_seqs():
+            seq.append_token_id(token_id, {token_id: Logprob(token_id)})
+
+
+def schedule_and_update_computed_tokens(scheduler):
+    metas, out = scheduler.schedule()
+    for s, meta in zip(out.scheduled_seq_groups, metas):
+        s.seq_group.update_num_computed_tokens(meta.token_chunk_size)
+    return metas, out
+
+
+def append_new_token_seq_group(token_chunk_size, seq_group, token_id: int):
+    seq_group.update_num_computed_tokens(token_chunk_size)
+    for seq in seq_group.get_seqs():
+        seq.append_token_id(token_id, {token_id: Logprob(token_id)})
+
+
+def test_scheduler_add_seq_group():
+    block_size = 4
+    scheduler_config = SchedulerConfig(100, 64, 1)
+    cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
+    cache_config.num_cpu_blocks = 4
+    cache_config.num_gpu_blocks = 4
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+
+    # Add seq group to scheduler.
+    num_seq_group = 4
+    for i in range(num_seq_group):
+        _, seq_group = create_dummy_prompt(str(i), block_size)
+        scheduler.add_seq_group(seq_group)
+        assert scheduler.get_num_unfinished_seq_groups() == i + 1
+
+
+def test_scheduler_abort_seq_group():
+    block_size = 4
+    scheduler_config = SchedulerConfig(100, 64, 1)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 4
+    cache_config.num_gpu_blocks = 4
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+
+    # Add multiple seq groups to scheduler.
+    num_seq_group = 4
+    request_ids = set()
+    for i in range(num_seq_group):
+        _, seq_group = create_dummy_prompt(str(i), block_size)
+        scheduler.add_seq_group(seq_group)
+        request_ids.add(str(i))
+
+    # Abort all added seq groups.
+    assert scheduler.get_num_unfinished_seq_groups() == num_seq_group
+    scheduler.abort_seq_group(request_ids)
+    assert scheduler.get_num_unfinished_seq_groups() == 0
+
+
+def test_scheduler_schedule_simple():
+    block_size = 4
+    num_seq_group = 4
+    max_model_len = 16
+    scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    running: List[SequenceGroup] = []
+
+    # Add seq groups to scheduler.
+    for i in range(num_seq_group):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=block_size)
+        scheduler.add_seq_group(seq_group)
+        running.append(seq_group)
+
+    # Schedule seq groups prompts.
+    num_tokens = block_size * num_seq_group
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert set(get_sequence_groups(out)) == set(running)
+    assert out.num_batched_tokens == num_tokens
+    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
+            and not out.blocks_to_swap_out)
+    assert len(seq_group_meta) == num_seq_group
+    append_new_token(out, 1)
+
+    # Schedule seq groups generation.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert set(get_sequence_groups(out)) == set(running)
+    assert out.num_batched_tokens == num_seq_group
+    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
+            and not out.blocks_to_swap_out)
+    assert len(seq_group_meta) == num_seq_group
+    append_new_token(out, 1)
+
+
+def test_scheduler_prefill_prioritized():
+    """Verify running batched tokens are not applied to prefill requests."""
+    block_size = 4
+    max_model_len = 30
+    max_batched_num_tokens = 30
+    scheduler_config = SchedulerConfig(max_batched_num_tokens, 2,
+                                       max_model_len)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 2
+    cache_config.num_gpu_blocks = 2
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+
+    # Add seq groups to scheduler.
+    _, seq_group_a = create_dummy_prompt("1", 1)
+    scheduler.add_seq_group(seq_group_a)
+
+    # Schedule seq groups prompts.
+    _, out = schedule_and_update_computed_tokens(scheduler)
+    assert get_sequence_groups(out) == [seq_group_a]
+
+    # Add a new prefill request B.
+    _, seq_group_b = create_dummy_prompt("2", 30)
+    scheduler.add_seq_group(seq_group_b)
+
+    # Verify prefill requests are prioritized. Since max_batched_num_tokens
+    # is 1, new prefill request has to be scheduled first.
+    _, out = schedule_and_update_computed_tokens(scheduler)
+    assert get_sequence_groups(out) == [seq_group_b]
+
+
+def test_scheduler_schedule_preempt_abort():
+    block_size = 4
+    max_model_len = 16
+    scheduler_config = SchedulerConfig(64, 2, max_model_len)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 2
+    cache_config.num_gpu_blocks = 2
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+
+    # Add seq groups to scheduler.
+    seq_a, seq_group_a = create_dummy_prompt("1", block_size)
+    seq_b, seq_group_b = create_dummy_prompt("2", block_size)
+    scheduler.add_seq_group(seq_group_a)
+    scheduler.add_seq_group(seq_group_b)
+
+    # Schedule seq groups prompts.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert get_sequence_groups(out) == [seq_group_a, seq_group_b]
+    assert out.num_batched_tokens == block_size * 2  # seq_a and seq_b
+    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
+            and not out.blocks_to_swap_out)
+    assert len(seq_group_meta) == 2
+    assert scheduler.get_num_unfinished_seq_groups() == 2
+
+    # Append "generated" tokens, allowing the sequence to mark prompt tokens as
+    # processed.
+    append_new_token(out, 1)
+
+    # Schedule seq groups generation and preempt seq group b.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert get_sequence_groups(out) == [seq_group_a]
+    assert out.num_batched_tokens == 1
+    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
+            and not out.blocks_to_swap_out)
+    assert len(seq_group_meta) == 1
+    assert scheduler.get_num_unfinished_seq_groups() == 2
+
+    # Abort seq group a. Re-schedule seq group b prompt with recomputation.
+    scheduler.abort_seq_group("1")
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert get_sequence_groups(out) == [seq_group_b]
+    assert out.num_batched_tokens == 5  # 4 prompt + 1 generation.
+    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
+            and not out.blocks_to_swap_out)
+    assert len(seq_group_meta) == 1
+    assert scheduler.get_num_unfinished_seq_groups() == 1
+
+
+def test_scheduler_max_seqs():
+    block_size = 4
+    num_seq_group = 4
+    max_seq_group = 2
+    max_model_len = 16
+    scheduler_config = SchedulerConfig(64, max_seq_group, max_model_len)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+
+    all_seq_groups: List[SequenceGroup] = []
+    # Add seq groups to scheduler.
+    for i in range(num_seq_group):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=block_size)
+        all_seq_groups.append(seq_group)
+
+    # Append 1 seq group
+    scheduler.add_seq_group(all_seq_groups[0])
+
+    # Schedule seq groups prompts.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert set(get_sequence_groups(out)) == set([all_seq_groups[0]])
+    append_new_token(out, 1)
+
+    # Schedule seq groups generation.
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert set(get_sequence_groups(out)) == set([all_seq_groups[0]])
+    append_new_token(out, 1)
+
+    # Append 2 more seq group
+    scheduler.add_seq_group(all_seq_groups[1])
+    scheduler.add_seq_group(all_seq_groups[2])
+
+    # Schedule seq groups prompts.
+    # Only 1 seq group should be scheduled since max_seq_group is 2
+    # and one is prompting.
+    _, out = schedule_and_update_computed_tokens(scheduler)
+    assert set(get_sequence_groups(out)) == set([all_seq_groups[1]])
+
+
+def test_scheduler_delay_factor():
+    block_size = 4
+    scheduler_config = SchedulerConfig(100, 64, 16, delay_factor=0.5)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+
+    # schedule first prompt
+    seq_group_meta, seq_group = create_dummy_prompt("0",
+                                                    prompt_length=block_size)
+    scheduler.add_seq_group(seq_group)
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert out.num_prefill_groups > 0
+    assert seq_group_meta[0].request_id == '0'
+    append_new_token(out, 1)
+
+    # wait for a second before scheduling next prompt
+    time.sleep(1)
+    seq_group_meta, seq_group = create_dummy_prompt("1",
+                                                    prompt_length=block_size)
+    scheduler.add_seq_group(seq_group)
+
+    # second prompt should *not* be scheduled
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert out.num_prefill_groups == 0
+    assert seq_group_meta[0].request_id == '0'
+    append_new_token(out, 1)
+
+    # wait for more than 0.5 second and try again
+    time.sleep(0.6)
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert out.num_prefill_groups > 0
+    assert seq_group_meta[0].request_id == '1'
+    append_new_token(out, 1)
+
+
+def test_swapped_out_prioritized():
+    scheduler = initialize_scheduler(max_num_seqs=6)
+    # best_of=2 * 3 == 6 sequences.
+    for i in range(3):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2)
+        scheduler.add_seq_group(seq_group)
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    # prefill scheduled now.
+    assert len(out.scheduled_seq_groups) == 3
+    append_new_token(out, 1)
+
+    # The last request should be swapped out.
+    scheduler.block_manager.can_append_slots = MagicMock()
+
+    def cannot_append_second_group(seq_group, num_lookahead_slots):
+        return seq_group.request_id != "2"
+
+    scheduler.block_manager.can_append_slots.side_effect = (
+        cannot_append_second_group)
+
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(out.scheduled_seq_groups) == 2
+    assert out.num_batched_tokens == 2
+    assert out.blocks_to_swap_out != {}
+    assert out.blocks_to_swap_in == {}
+    append_new_token(out, 1)
+
+    # Add 1 more task. Swap should be prioritized over prefill.
+    _, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2)
+    scheduler.add_seq_group(seq_group)
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    append_new_token(out, 1)
+    assert len(out.scheduled_seq_groups) == 3
+    # 3 decodes. It is swapped in.
+    assert out.num_batched_tokens == 3
+    assert out.blocks_to_swap_in != {}
+    assert out.blocks_to_swap_out == {}
+
+
+def initialize_scheduler(*,
+                         max_num_seqs=1000,
+                         max_token_budget=1000,
+                         max_model_len=1000,
+                         lora_config=None):
+    block_size = 4
+    scheduler_config = SchedulerConfig(max_token_budget, max_num_seqs,
+                                       max_model_len)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, lora_config)
+    return scheduler
+
+
+def create_token_budget(token_budget: int = 10000,
+                        max_num_seqs: int = 10000) -> SchedulingBudget:
+    return SchedulingBudget(
+        token_budget=token_budget,
+        max_num_seqs=max_num_seqs,
+    )
+
+
+def add_token_budget(budget: SchedulingBudget,
+                     num_batched_tokens: int = 0,
+                     num_curr_seqs: int = 0):
+    mock_seq_group = create_dummy_prompt('10', prompt_length=60)[1]
+    budget.add_num_batched_tokens(mock_seq_group.request_id,
+                                  num_batched_tokens)
+    budget.add_num_seqs(mock_seq_group.request_id, num_curr_seqs)
+
+
+def test_prefill_schedule_max_prompt_len():
+    """
+    Test prompt longer than max_prompt_len is aborted.
+    """
+    scheduler = initialize_scheduler(max_model_len=30)
+    _, seq_group = create_dummy_prompt(0, prompt_length=60)
+    waiting = deque([seq_group])
+    budget = create_token_budget()
+    remaining_waiting, output = scheduler._schedule_prefills(
+        waiting, budget, None)
+    assert len(output.ignored_seq_groups) == 1
+    assert len(output.seq_groups) == 0
+    assert budget.num_batched_tokens == 0
+    assert budget.num_curr_seqs == 0
+    assert len(remaining_waiting) == 0
+
+
+def test_prefill_schedule_token_budget():
+    """
+    Test token budget respected.
+    """
+    scheduler = initialize_scheduler()
+    waiting = deque()
+    budget = create_token_budget(token_budget=0)
+    for i in range(2):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        waiting.append(seq_group)
+
+    # 0 token budget == nothing is scheduled.
+    remaining_waiting, output = scheduler._schedule_prefills(
+        waiting, budget, None)
+    assert len(output.ignored_seq_groups) == 0
+    assert len(output.seq_groups) == 0
+    assert budget.num_batched_tokens == 0
+    assert budget.num_curr_seqs == 0
+    assert len(remaining_waiting) == 2
+
+    # 60 token budget == 1 request scheduled.
+    budget = create_token_budget(token_budget=60)
+    remaining_waiting, output = scheduler._schedule_prefills(
+        waiting, budget, None)
+    assert len(output.ignored_seq_groups) == 0
+    assert len(output.seq_groups) == 1
+    assert budget.num_batched_tokens == 60
+    assert budget.num_curr_seqs == 1
+    assert len(remaining_waiting) == 1
+
+    # Test when current_batched_tokens respected.
+    scheduler = initialize_scheduler()
+    waiting = deque()
+    budget = create_token_budget(token_budget=60)
+    add_token_budget(budget, 30, 0)
+    _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+    # Cannot schedule a prompt that doesn't fit the budget.
+    waiting.append(seq_group)
+    remaining_waiting, output = scheduler._schedule_prefills(
+        waiting, budget, None)
+    assert len(output.ignored_seq_groups) == 0
+    assert len(output.seq_groups) == 0
+    assert budget.num_batched_tokens == 30
+    assert budget.num_curr_seqs == 0
+    assert len(remaining_waiting) == 1
+    budget = create_token_budget(token_budget=90)
+    add_token_budget(budget, 30, 0)
+    remaining_waiting, output = scheduler._schedule_prefills(
+        waiting, budget, None)
+    assert len(output.seq_groups) == 1
+    assert budget.num_batched_tokens == 90
+    assert budget.num_curr_seqs == 1
+    assert len(remaining_waiting) == 0
+
+
+def test_prefill_schedule_max_seqs():
+    """
+    Test max seq respected.
+    """
+    scheduler = initialize_scheduler()
+    waiting = deque()
+    budget = create_token_budget(max_num_seqs=2)
+    for i in range(3):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        waiting.append(seq_group)
+    remaining_waiting, output = scheduler._schedule_prefills(
+        waiting, budget, None)
+    assert len(output.ignored_seq_groups) == 0
+    assert len(output.seq_groups) == 2
+    assert budget.num_batched_tokens == 120
+    assert budget.num_curr_seqs == 2
+    assert len(remaining_waiting) == 1
+
+    # Verify curr_num_seqs respected.
+    waiting = deque()
+    budget = create_token_budget(max_num_seqs=2)
+    add_token_budget(budget, 0, 2)
+    _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+    waiting.append(seq_group)
+    remaining_waiting, output = scheduler._schedule_prefills(
+        waiting, budget, None)
+    assert len(output.ignored_seq_groups) == 0
+    assert len(output.seq_groups) == 0
+    assert budget.num_batched_tokens == 0
+    assert budget.num_curr_seqs == 2
+    assert len(remaining_waiting) == 1
+
+
+def test_prefill_schedule_max_lora():
+    """
+    Test max lora is respected and prioritized.
+    """
+    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
+    scheduler = initialize_scheduler(lora_config=lora_config)
+    waiting = deque()
+    budget = create_token_budget(token_budget=120)
+    curr_loras = set()
+    for i in range(2):
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=60,
+                                           lora_request=LoRARequest(
+                                               lora_name=str(i),
+                                               lora_int_id=i + 1,
+                                               lora_local_path="abc"))
+        waiting.append(seq_group)
+    # Add two more requests to verify lora is prioritized.
+    # 0: Lora, 1: Lora, 2: regular, 3: regular
+    # In the first iteration, index 0, 2 is scheduled.
+    # If a request is not scheduled because it hits max lora, it is
+    # prioritized. Verify that.
+    for i in range(2, 4):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        waiting.append(seq_group)
+    # Schedule 2 requests (0 and 2)
+    remaining_waiting, output = scheduler._schedule_prefills(
+        waiting, budget, curr_loras)
+    assert len(output.ignored_seq_groups) == 0
+    assert len(output.seq_groups) == 2
+    assert budget.num_batched_tokens == 120
+    assert budget.num_curr_seqs == 2
+    assert len(remaining_waiting) == 2
+    assert len(curr_loras) == 1
+    # The second lora request is scheduled next as FCFS policy.
+    # Reset curr_loras so that it can be scheduled.
+    curr_loras = set()
+    budget = create_token_budget(token_budget=60)
+    remaining_waiting, output = scheduler._schedule_prefills(
+        remaining_waiting, budget, curr_loras)
+    assert len(output.seq_groups) == 1
+    assert output.seq_groups[0].seq_group.request_id == "1"
+    assert len(remaining_waiting) == 1
+    assert len(curr_loras) == 1
+    assert budget.num_batched_tokens == 60
+
+
+def test_prefill_schedule_no_block_manager_capacity():
+    """
+    Test sequence cannot be scheduled due to block manager has no capacity.
+    """
+    scheduler = initialize_scheduler()
+    waiting = deque()
+    budget = create_token_budget()
+    for i in range(3):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        waiting.append(seq_group)
+    scheduler.block_manager.can_allocate = MagicMock()
+    scheduler.block_manager.can_allocate.return_value = AllocStatus.LATER
+    remainig_waiting, output = scheduler._schedule_prefills(
+        waiting, budget, None)
+    assert len(output.ignored_seq_groups) == 0
+    assert len(output.seq_groups) == 0
+    assert budget.num_batched_tokens == 0
+    assert budget.num_curr_seqs == 0
+    assert len(remainig_waiting) == 3
+
+    scheduler = initialize_scheduler()
+    waiting = deque()
+    budget = create_token_budget()
+    for i in range(3):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        waiting.append(seq_group)
+    scheduler.block_manager.can_allocate = MagicMock()
+    scheduler.block_manager.can_allocate.return_value = AllocStatus.NEVER
+    remaining_waiting, output = scheduler._schedule_prefills(
+        waiting, budget, None)
+    assert len(output.ignored_seq_groups) == 3
+    assert len(output.seq_groups) == 0
+    assert budget.num_batched_tokens == 0
+    assert budget.num_curr_seqs == 0
+    assert len(remaining_waiting) == 0
+
+
+def test_decode_schedule_preempted():
+    """
+    Test decodes cannot be scheduled and preempted.
+    """
+    scheduler = initialize_scheduler()
+    running = deque()
+    policy = PolicyFactory.get_policy(policy_name="fcfs")
+    curr_loras = None
+    for i in range(3):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        scheduler._allocate_and_set_running(seq_group)
+        append_new_token_seq_group(60, seq_group, 1)
+        running.append(seq_group)
+    scheduler.block_manager.can_append_slots = MagicMock()
+
+    def cannot_append_second_group(seq_group, num_lookahead_slots):
+        return seq_group.request_id != "1"
+
+    scheduler.block_manager.can_append_slots.side_effect = (
+        cannot_append_second_group)
+
+    # 1 cannot be scheduled, and the lowest priority (request 2)
+    # should be preempted. 1 will also be preempted.
+    budget = create_token_budget()
+    remainig_running, output = scheduler._schedule_running(
+        running, budget, curr_loras, policy)
+    assert len(remainig_running) == 0
+    assert len(output.decode_seq_groups) == 1
+    assert len(output.prefill_seq_groups) == 0
+    assert output.decode_seq_groups[0].seq_group.request_id == "0"
+    assert len(output.preempted) == 2
+    # Verify budgets are updated.
+    assert budget.num_batched_tokens == 1
+    # NOTE: When enable_chunk is False, num_seqs budget is not updated.
+    # assert budget.num_curr_seqs == 1
+    # Both should be preempted, not swapped.
+    assert output.blocks_to_swap_out == {}
+    # Nothing is copied.
+    assert output.blocks_to_copy == {}
+
+
+def test_decode_swap_beam_search():
+    """
+    Test best_of > 1 swap out blocks
+    """
+    scheduler = initialize_scheduler()
+    running = deque()
+    policy = PolicyFactory.get_policy(policy_name="fcfs")
+    curr_loras = None
+    budget = create_token_budget()
+    for i in range(3):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2)
+        scheduler._allocate_and_set_running(seq_group)
+        running.append(seq_group)
+        append_new_token_seq_group(60, seq_group, 1)
+        budget.add_num_seqs(seq_group.request_id,
+                            seq_group.get_max_num_running_seqs())
+        budget.add_num_batched_tokens(
+            seq_group.request_id, seq_group.num_seqs(SequenceStatus.RUNNING))
+
+    # The last request should be swapped out.
+    scheduler.block_manager.can_append_slots = MagicMock()
+
+    def cannot_append_second_group(seq_group, num_lookahead_slots):
+        return seq_group.request_id != "2"
+
+    scheduler.block_manager.can_append_slots.side_effect = (
+        cannot_append_second_group)
+    scheduler.block_manager.swap_out = MagicMock()
+    expected_swap_mapping = {"5": "7"}
+    scheduler.block_manager.swap_out.return_value = expected_swap_mapping
+
+    remainig_running, output = scheduler._schedule_running(
+        running, budget, curr_loras, policy)
+    assert len(remainig_running) == 0
+    assert len(output.decode_seq_groups) == 2
+    assert len(output.prefill_seq_groups) == 0
+    assert output.decode_seq_groups[0].seq_group.request_id == "0"
+    assert output.decode_seq_groups[1].seq_group.request_id == "1"
+    assert len(output.preempted) == 0
+    assert len(output.swapped_out) == 1
+    # Budget should refledct preempted requests.
+    assert budget.num_batched_tokens == 2
+    # since there are 2 sequences, 2 should be subtracted.
+    assert budget.num_curr_seqs == 4
+    # Both should be preempted, not swapped.
+    assert output.blocks_to_swap_out == expected_swap_mapping
+    # Nothing is copied.
+    assert output.blocks_to_copy == {}
+
+
+def test_schedule_decode_blocks_to_copy_update():
+    """
+    Verify blocks_to_copy is updated.
+    """
+    scheduler = initialize_scheduler()
+    _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
+    running = deque()
+    policy = PolicyFactory.get_policy(policy_name="fcfs")
+    curr_loras = None
+    scheduler._allocate_and_set_running(seq_group)
+    append_new_token_seq_group(60, seq_group, 1)
+    running.append(seq_group)
+
+    # The last request should be swapped out.
+    scheduler.block_manager.append_slots = MagicMock()
+    scheduler.block_manager.append_slots.return_value = {2: [3]}
+
+    budget = create_token_budget()
+    remaining_running, output = scheduler._schedule_running(
+        running, budget, curr_loras, policy)
+    assert len(remaining_running) == 0
+    assert len(output.decode_seq_groups) == 1
+    assert len(output.prefill_seq_groups) == 0
+    assert len(output.preempted) == 0
+    assert len(output.swapped_out) == 0
+    # Nothing is preempted.
+    assert output.blocks_to_swap_out == {}
+    # Since append_slot returns the source -> dist mapping, it should
+    # applied.
+    assert output.blocks_to_copy == {2: [3]}
+
+
+def test_schedule_swapped_simple():
+    scheduler = initialize_scheduler()
+    swapped = deque()
+    policy = PolicyFactory.get_policy(policy_name="fcfs")
+    curr_loras = None
+    blocks_to_swap_out = {}
+    _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
+    scheduler._allocate_and_set_running(seq_group)
+    append_new_token_seq_group(60, seq_group, 1)
+    scheduler._swap_out(seq_group, blocks_to_swap_out)
+    swapped.append(seq_group)
+
+    budget = create_token_budget()
+    remaining_swapped, output = scheduler._schedule_swapped(
+        swapped, budget, curr_loras, policy)
+    assert len(remaining_swapped) == 0
+    assert budget.num_batched_tokens == 1
+    assert budget.num_curr_seqs == 2
+    assert len(output.decode_seq_groups) == 1
+    assert len(output.prefill_seq_groups) == 0
+    # swap in is the reverse of swap out
+    blocks_to_swap_in_reverse = {}
+    for swapin, swapout in output.blocks_to_swap_in.items():
+        blocks_to_swap_in_reverse[swapout] = swapin
+    assert blocks_to_swap_out == blocks_to_swap_in_reverse
+
+
+def test_schedule_swapped_max_token_budget():
+    scheduler = initialize_scheduler()
+    swapped = deque()
+    policy = PolicyFactory.get_policy(policy_name="fcfs")
+    curr_loras = None
+    blocks_to_swap_out = {}
+    for _ in range(2):
+        _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
+        scheduler._allocate_and_set_running(seq_group)
+        append_new_token_seq_group(60, seq_group, 1)
+        scheduler._swap_out(seq_group, blocks_to_swap_out)
+        swapped.append(seq_group)
+
+    budget = create_token_budget(token_budget=1)
+    remaining_swapped, output = scheduler._schedule_swapped(
+        swapped, budget, curr_loras, policy)
+    assert len(remaining_swapped) == 1
+    assert budget.num_batched_tokens == 1
+    assert budget.num_curr_seqs == 2
+    assert len(output.decode_seq_groups) == 1
+    assert len(output.prefill_seq_groups) == 0
+
+    # Verify num_batched_tokens are respected.
+    budget = create_token_budget(token_budget=1)
+    add_token_budget(budget, 1, 0)
+    remaining_swapped, output = scheduler._schedule_swapped(
+        remaining_swapped, budget, curr_loras, policy)
+    assert len(remaining_swapped) == 1
+    assert budget.num_batched_tokens == 1
+    assert budget.num_curr_seqs == 0
+    assert len(output.decode_seq_groups) == 0
+    assert len(output.prefill_seq_groups) == 0
+
+
+def test_schedule_swapped_max_seqs():
+    scheduler = initialize_scheduler()
+    swapped = deque()
+    policy = PolicyFactory.get_policy(policy_name="fcfs")
+    curr_loras = None
+    blocks_to_swap_out = {}
+    for i in range(4):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        scheduler._allocate_and_set_running(seq_group)
+        append_new_token_seq_group(60, seq_group, 1)
+        scheduler._swap_out(seq_group, blocks_to_swap_out)
+        swapped.append(seq_group)
+
+    budget = create_token_budget(max_num_seqs=2)
+    remaining_swapped, output = scheduler._schedule_swapped(
+        swapped, budget, curr_loras, policy)
+    assert len(remaining_swapped) == 2
+    assert budget.num_batched_tokens == 2
+    assert budget.num_curr_seqs == 2
+    assert len(output.decode_seq_groups) == 2
+    assert len(output.prefill_seq_groups) == 0
+
+    # Verify num_curr_seqs are respected.
+    remaining_swapped, output = scheduler._schedule_swapped(
+        remaining_swapped, budget, curr_loras, policy)
+    assert len(remaining_swapped) == 2
+    assert budget.num_batched_tokens == 2
+    assert budget.num_curr_seqs == 2
+    assert len(output.decode_seq_groups) == 0
+    assert len(output.prefill_seq_groups) == 0
+
+
+def test_schedule_swapped_max_loras():
+    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
+    scheduler = initialize_scheduler(lora_config=lora_config)
+    swapped = deque()
+    policy = PolicyFactory.get_policy(policy_name="fcfs")
+    curr_loras = set()
+    blocks_to_swap_out = {}
+    for i in range(2):
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=60,
+                                           lora_request=LoRARequest(
+                                               lora_name=str(i),
+                                               lora_int_id=i + 1,
+                                               lora_local_path="abc"))
+        scheduler._allocate_and_set_running(seq_group)
+        append_new_token_seq_group(60, seq_group, 1)
+        scheduler._swap_out(seq_group, blocks_to_swap_out)
+        swapped.append(seq_group)
+
+    budget = create_token_budget()
+    remaining_swapped, output = scheduler._schedule_swapped(
+        swapped, budget, curr_loras, policy)
+    assert len(remaining_swapped) == 1
+    assert budget.num_batched_tokens == 1
+    assert budget.num_curr_seqs == 1
+    assert len(output.decode_seq_groups) == 1
+    assert len(output.prefill_seq_groups) == 0
+    assert len(curr_loras) == 1
+
+
+def test_schedule_swapped_cannot_swap_in():
+    scheduler = initialize_scheduler()
+    swapped = deque()
+    policy = PolicyFactory.get_policy(policy_name="fcfs")
+    curr_loras = None
+    blocks_to_swap_out = {}
+    for _ in range(2):
+        _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
+        scheduler._allocate_and_set_running(seq_group)
+        append_new_token_seq_group(60, seq_group, 1)
+        scheduler._swap_out(seq_group, blocks_to_swap_out)
+        swapped.append(seq_group)
+
+    # The last request should be swapped out.
+    scheduler.block_manager.can_swap_in = MagicMock()
+    scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER
+    # Since we cannot swap in, none of the requests are swapped in.
+    budget = create_token_budget()
+    remaining_swapped, output = scheduler._schedule_swapped(
+        swapped, budget, curr_loras, policy)
+    assert len(remaining_swapped) == 2
+    assert budget.num_batched_tokens == 0
+    assert budget.num_curr_seqs == 0
+    assert len(output.decode_seq_groups) == 0
+    assert len(output.prefill_seq_groups) == 0
+
+
+def test_infeasible_swap():
+    scheduler = initialize_scheduler()
+    swapped = deque()
+    policy = PolicyFactory.get_policy(policy_name="fcfs")
+    curr_loras = None
+    blocks_to_swap_out = {}
+    for _ in range(2):
+        _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
+        scheduler._allocate_and_set_running(seq_group)
+        append_new_token_seq_group(60, seq_group, 1)
+        scheduler._swap_out(seq_group, blocks_to_swap_out)
+        swapped.append(seq_group)
+
+    # The last request should be swapped out.
+    scheduler.block_manager.can_swap_in = MagicMock()
+    scheduler.block_manager.can_swap_in.return_value = AllocStatus.NEVER
+    # Since we cannot swap in, none of the requests are swapped in.
+    budget = create_token_budget()
+    remaining_swapped, output = scheduler._schedule_swapped(
+        swapped, budget, curr_loras, policy)
+    assert len(remaining_swapped) == 0
+    assert len(output.infeasible_seq_groups) == 2
+    assert budget.num_batched_tokens == 0
+    assert budget.num_curr_seqs == 0
+    assert len(output.decode_seq_groups) == 0
+    assert len(output.prefill_seq_groups) == 0
+
+
+def test_schedule_swapped_blocks_to_copy():
+    scheduler = initialize_scheduler()
+    swapped = deque()
+    policy = PolicyFactory.get_policy(policy_name="fcfs")
+    curr_loras = None
+    _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
+    scheduler._allocate_and_set_running(seq_group)
+    append_new_token_seq_group(60, seq_group, 1)
+    blocks_to_swap_out = {}
+    scheduler._swap_out(seq_group, blocks_to_swap_out)
+    swapped.append(seq_group)
+
+    # The last request should be swapped out.
+    scheduler.block_manager.append_slots = MagicMock()
+    scheduler.block_manager.append_slots.return_value = {2: [3]}
+
+    budget = create_token_budget()
+    remaining_swapped, output = scheduler._schedule_swapped(
+        swapped, budget, curr_loras, policy)
+    assert len(remaining_swapped) == 0
+    assert len(output.decode_seq_groups) == 1
+    assert len(output.prefill_seq_groups) == 0
+    assert output.blocks_to_copy == {2: [3]}
+
+
+def test_scheduling_budget():
+    TOKEN_BUDGET = 4
+    MAX_SEQS = 4
+    budget = SchedulingBudget(token_budget=TOKEN_BUDGET, max_num_seqs=MAX_SEQS)
+    assert budget.can_schedule(num_new_tokens=1, num_new_seqs=1)
+    assert budget.can_schedule(num_new_tokens=4, num_new_seqs=4)
+    assert not budget.can_schedule(num_new_tokens=1, num_new_seqs=5)
+    assert not budget.can_schedule(num_new_tokens=5, num_new_seqs=1)
+    assert not budget.can_schedule(num_new_tokens=5, num_new_seqs=5)
+    assert budget.remaining_token_budget() == TOKEN_BUDGET
+
+    # Verify add/subtract num batched tokens.
+    _, seq_group = create_dummy_prompt("1", 3)
+    budget.add_num_batched_tokens(seq_group.request_id, 2)
+    assert budget.remaining_token_budget() == 2
+    assert budget.num_batched_tokens == 2
+    assert budget.can_schedule(num_new_tokens=2, num_new_seqs=1)
+    assert not budget.can_schedule(num_new_tokens=3, num_new_seqs=1)
+    # Verify adding another seq group is no-op.
+    budget.add_num_batched_tokens(seq_group.request_id, 2)
+    assert budget.remaining_token_budget() == 2
+    assert budget.num_batched_tokens == 2
+    budget.subtract_num_batched_tokens(seq_group.request_id, 2)
+    assert budget.remaining_token_budget() == 4
+    assert budget.num_batched_tokens == 0
+    budget.subtract_num_batched_tokens(seq_group.request_id, 2)
+    assert budget.remaining_token_budget() == 4
+    assert budget.num_batched_tokens == 0
+
+    # Verify add/subtract max seqs.
+    _, seq_group = create_dummy_prompt("1", 3)
+    budget.add_num_seqs(seq_group.request_id, 2)
+    assert budget.can_schedule(num_new_tokens=1, num_new_seqs=2)
+    assert not budget.can_schedule(num_new_tokens=1, num_new_seqs=3)
+    assert budget.num_curr_seqs == 2
+    # Verify adding another seq group is no-op.
+    budget.add_num_seqs(seq_group.request_id, 2)
+    assert budget.num_curr_seqs == 2
+    budget.subtract_num_seqs(seq_group.request_id, 2)
+    assert budget.num_curr_seqs == 0
+    budget.subtract_num_seqs(seq_group.request_id, 2)
+    assert budget.num_curr_seqs == 0
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -0,0 +1,74 @@
+import time
+from typing import Iterable, Optional, Tuple
+
+from vllm import SamplingParams
+from vllm.lora.request import LoRARequest
+from vllm.sequence import Logprob, Sequence, SequenceGroup
+
+
+def create_dummy_prompt(
+    request_id: str,
+    prompt_length: int,
+    block_size: Optional[int] = None,
+    lora_request: Optional[LoRARequest] = None,
+    use_beam_search: bool = False,
+    best_of: int = 1,
+) -> Tuple[Sequence, SequenceGroup]:
+    if not block_size:
+        block_size = prompt_length
+
+    # Create dummy prompt sequence with tokens 0...block_size-1
+    # and prompt "0 ... block_size".
+    prompt_tokens = list(range(prompt_length))
+    prompt_str = " ".join([str(t) for t in prompt_tokens])
+    prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size)
+    seq_group = SequenceGroup(
+        request_id, [prompt],
+        SamplingParams(use_beam_search=use_beam_search, best_of=best_of),
+        time.time(), lora_request)
+
+    return prompt, seq_group
+
+
+def create_seq_group(
+        seq_prompt_len: int = 1024,
+        seq_output_lens: Iterable[int] = (128, ),
+        request_id: str = '0',
+        seq_id_start: int = 0,
+        sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:
+
+    assert len(seq_output_lens) > 0
+
+    if sampling_params is None:
+        sampling_params = SamplingParams()
+
+    prompt_token_ids = [0] * seq_prompt_len
+
+    seqs = []
+    for seq_id_offset, output_len in enumerate(seq_output_lens):
+        seq = Sequence(
+            seq_id=seq_id_start + seq_id_offset,
+            prompt="",
+            prompt_token_ids=prompt_token_ids,
+            block_size=16,
+        )
+
+        for i in range(output_len):
+            seq.append_token_id(
+                token_id=i,
+                logprobs={i: Logprob(0.0)},
+            )
+        seqs.append(seq)
+
+    seq_group = SequenceGroup(
+        request_id=request_id,
+        seqs=seqs,
+        sampling_params=sampling_params,
+        arrival_time=time.time(),
+    )
+
+    return seq_group
+
+
+def round_up_to_next_block(seq_len: int, block_size: int) -> int:
+    return (seq_len + block_size - 1) // block_size
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@@ -0,0 +1,59 @@
+"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
+vLLM will allocate all the available memory, so we need to run the tests one
+by one. The solution is to pass arguments (model name) by environment
+variables.
+Run:
+```sh
+TEST_DIST_MODEL=facebook/opt-125m pytest \
+    test_basic_distributed_correctness.py
+TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
+    test_basic_distributed_correctness.py
+```
+"""
+import os
+
+import pytest
+import torch
+
+MODELS = [
+    os.environ["TEST_DIST_MODEL"],
+]
+VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [5])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    enforce_eager = False
+    backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
+    if backend_by_env_var == "FLASHINFER":
+        enforce_eager = True
+
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    del hf_model
+
+    vllm_model = vllm_runner(model,
+                             dtype=dtype,
+                             tensor_parallel_size=2,
+                             enforce_eager=enforce_eager)
+    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    del vllm_model
+
+    for i in range(len(example_prompts)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ b/tests/distributed/test_chunked_prefill_distributed.py
@@ -0,0 +1,66 @@
+"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
+vLLM will allocate all the available memory, so we need to run the tests one
+by one. The solution is to pass arguments (model name) by environment
+variables.
+
+Run:
+```sh
+TEST_DIST_MODEL=facebook/opt-125m pytest \
+    test_chunked_prefill_distributed.py
+TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
+    test_chunked_prefill_distributed.py
+```
+"""
+import os
+
+import pytest
+import torch
+
+MODELS = [
+    os.environ["TEST_DIST_MODEL"],
+]
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [5])
+@pytest.mark.parametrize("chunked_prefill_token_size", [16])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    chunked_prefill_token_size: int,
+) -> None:
+    # Add a chunked prefill config.
+    max_num_seqs = min(chunked_prefill_token_size, 256)
+    assert chunked_prefill_token_size != -1
+    enable_chunked_prefill = True
+    max_num_batched_tokens = chunked_prefill_token_size
+
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    del hf_model
+
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        tensor_parallel_size=2,
+        max_num_seqs=max_num_seqs,
+        enable_chunked_prefill=enable_chunked_prefill,
+        max_num_batched_tokens=max_num_batched_tokens,
+    )
+    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    del vllm_model
+
+    for i in range(len(example_prompts)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -0,0 +1,110 @@
+"""Test the communication operators.
+
+Run `pytest tests/distributed/test_comm_ops.py`.
+"""
+import os
+
+import pytest
+import ray
+import torch
+
+from vllm.distributed import (broadcast_tensor_dict,
+                              tensor_model_parallel_all_gather,
+                              tensor_model_parallel_all_reduce)
+from vllm.test_utils import (init_test_distributed_environment,
+                             multi_process_tensor_parallel)
+
+
+@ray.remote(num_gpus=1, max_calls=1)
+def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
+                           distributed_init_port: str):
+    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
+    # so that each worker can see all the GPUs
+    # they will be able to set the device to the correct GPU
+    del os.environ["CUDA_VISIBLE_DEVICES"]
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    init_test_distributed_environment(1, tensor_parallel_size, rank,
+                                      distributed_init_port)
+    num_elements = 8
+    all_tensors = [
+        torch.arange(num_elements, dtype=torch.float32, device="cuda") *
+        (r + 1) for r in range(tensor_parallel_size)
+    ]
+    expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
+    t = all_tensors[rank]
+    t = tensor_model_parallel_all_reduce(t)
+    assert torch.allclose(t, expected)
+
+
+@ray.remote(num_gpus=1, max_calls=1)
+def all_gather_test_worker(tensor_parallel_size: int, rank: int,
+                           distributed_init_port: str):
+    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
+    # so that each worker can see all the GPUs
+    # they will be able to set the device to the correct GPU
+    del os.environ["CUDA_VISIBLE_DEVICES"]
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    init_test_distributed_environment(1, tensor_parallel_size, rank,
+                                      distributed_init_port)
+    num_dimensions = 3
+    tensor_size = list(range(2, num_dimensions + 2))
+    total_size = 1
+    for s in tensor_size:
+        total_size *= s
+    for all_gather_dimension in range(num_dimensions):
+        all_tensors = [
+            torch.arange(total_size, dtype=torch.float32,
+                         device="cuda").reshape(tensor_size) * (r + 1)
+            for r in range(tensor_parallel_size)
+        ]
+        expected = torch.cat(all_tensors, dim=all_gather_dimension)
+        t = all_tensors[rank]
+        t = tensor_model_parallel_all_gather(t, all_gather_dimension)
+        assert torch.allclose(t, expected)
+
+
+@ray.remote(num_gpus=1, max_calls=1)
+def broadcast_tensor_dict_test_worker(tensor_parallel_size: int, rank: int,
+                                      distributed_init_port: str):
+    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
+    # so that each worker can see all the GPUs
+    # they will be able to set the device to the correct GPU
+    del os.environ["CUDA_VISIBLE_DEVICES"]
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    init_test_distributed_environment(1, tensor_parallel_size, rank,
+                                      distributed_init_port)
+    test_dict = {
+        "a": torch.arange(8, dtype=torch.float32, device="cuda"),
+        "b": torch.arange(16, dtype=torch.int8, device="cuda"),
+        "c": "test",
+        "d": [1, 2, 3],
+        "e": {
+            "a": 1,
+            "b": 2
+        },
+    }
+
+    if rank == 0:
+        broadcast_tensor_dict(test_dict, src=0)
+    else:
+        recv_dict = broadcast_tensor_dict(src=0)
+        assert len(recv_dict) == len(test_dict)
+        assert torch.allclose(recv_dict["a"], test_dict["a"])
+        assert torch.allclose(recv_dict["b"], test_dict["b"])
+        assert recv_dict["c"] == test_dict["c"]
+        assert recv_dict["d"] == test_dict["d"]
+        assert recv_dict["e"] == test_dict["e"]
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize("tensor_parallel_size", [2])
+@pytest.mark.parametrize("test_target", [
+    all_reduce_test_worker, all_gather_test_worker,
+    broadcast_tensor_dict_test_worker
+])
+def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
+    multi_process_tensor_parallel(tensor_parallel_size, test_target)
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -0,0 +1,84 @@
+import os
+import random
+
+import pytest
+import ray
+import torch
+import torch.distributed as dist
+
+from vllm.distributed import tensor_model_parallel_all_reduce
+from vllm.distributed.device_communicators import custom_all_reduce
+from vllm.test_utils import (init_test_distributed_environment,
+                             multi_process_tensor_parallel)
+
+random.seed(42)
+test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
+for i, v in enumerate(test_sizes):
+    test_sizes[i] -= v % 8
+
+
+@ray.remote(num_gpus=1, max_calls=1)
+def graph_allreduce(world_size, rank, distributed_init_port):
+    del os.environ["CUDA_VISIBLE_DEVICES"]
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    init_test_distributed_environment(1, world_size, rank,
+                                      distributed_init_port)
+
+    custom_all_reduce.init_custom_all_reduce()
+    for sz in test_sizes:
+        for dtype in [torch.float32, torch.float16, torch.bfloat16]:
+            with custom_all_reduce.capture():
+                # use integers so result matches NCCL exactly
+                inp1 = torch.randint(1,
+                                     16, (sz, ),
+                                     dtype=dtype,
+                                     device=torch.cuda.current_device())
+                inp2 = torch.randint(1,
+                                     16, (sz, ),
+                                     dtype=dtype,
+                                     device=torch.cuda.current_device())
+                torch.cuda.synchronize()
+                graph = torch.cuda.CUDAGraph()
+                with torch.cuda.graph(graph):
+                    out1 = tensor_model_parallel_all_reduce(inp1)
+                    # the input buffer is immediately modified to test
+                    # synchronization
+                    dist.all_reduce(inp1)
+                    out2 = tensor_model_parallel_all_reduce(inp2)
+                    dist.all_reduce(inp2)
+            graph.replay()
+            assert torch.allclose(out1, inp1)
+            assert torch.allclose(out2, inp2)
+
+
+@ray.remote(num_gpus=1, max_calls=1)
+def eager_allreduce(world_size, rank, distributed_init_port):
+    del os.environ["CUDA_VISIBLE_DEVICES"]
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    init_test_distributed_environment(1, world_size, rank,
+                                      distributed_init_port)
+
+    sz = 1024
+    custom_all_reduce.init_custom_all_reduce()
+    fa = custom_all_reduce.get_handle()
+    inp = torch.ones(sz, dtype=torch.float32, device=device)
+    out = fa.all_reduce_unreg(inp)
+    assert torch.allclose(out, inp * world_size)
+
+    inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
+    out = fa.all_reduce_unreg(inp)
+    assert torch.allclose(out, inp * world_size)
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize("tensor_parallel_size", [2])
+@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
+def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
+    multi_process_tensor_parallel(tensor_parallel_size, test_target)
+
+
+if __name__ == "__main__":
+    multi_process_tensor_parallel(2, graph_allreduce)
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -0,0 +1,159 @@
+import multiprocessing
+
+import pytest
+import torch
+
+import vllm.distributed.device_communicators.pymccl_utils as pymccl_utils
+from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
+from vllm.distributed.device_communicators.pynccl import (NCCLCommunicator,
+                                                          ncclGetUniqueId)
+from vllm.distributed.parallel_state import (
+    ensure_model_parallel_initialized, get_tensor_model_parallel_cpu_group,
+    init_distributed_environment, with_pynccl_for_all_reduce)
+from vllm.utils import update_environment_variables
+
+
+def distributed_run(fn, world_size):
+    number_of_processes = world_size
+    processes = []
+    for i in range(number_of_processes):
+        env = {}
+        env['RANK'] = str(i)
+        env['LOCAL_RANK'] = str(i)
+        env['WORLD_SIZE'] = str(number_of_processes)
+        env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
+        env['MASTER_ADDR'] = 'localhost'
+        env['MASTER_PORT'] = '12345'
+        p = multiprocessing.Process(target=fn, args=(env, ))
+        processes.append(p)
+        p.start()
+
+    for p in processes:
+        p.join()
+
+    for p in processes:
+        assert p.exitcode == 0
+
+
+def worker_fn_wrapper(fn):
+    # `multiprocessing.Process` cannot accept environment variables directly
+    # so we need to pass the environment variables as arguments
+    # and update the environment variables in the function
+    def wrapped_fn(env):
+        update_environment_variables(env)
+        init_distributed_environment()
+        fn()
+
+    return wrapped_fn
+
+
+@worker_fn_wrapper
+def worker_fn():
+    comm = NCCLCommunicator()
+    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank)
+    comm.all_reduce(tensor)
+    result = tensor.mean().cpu().item()
+    assert result == comm.world_size
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+def test_pynccl():
+    distributed_run(worker_fn, 2)
+
+
+@worker_fn_wrapper
+def multiple_tp_worker_fn():
+    device = torch.device(f"cuda:{torch.distributed.get_rank()}")
+    groups = [
+        torch.distributed.new_group(ranks=[0, 1], backend="gloo"),
+        torch.distributed.new_group(ranks=[2, 3], backend="gloo")
+    ]
+    group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1]
+    comm = NCCLCommunicator(group=group, device=device)
+    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
+    # two groups can communicate independently
+    if torch.distributed.get_rank() in [0, 1]:
+        comm.all_reduce(tensor)
+        comm.all_reduce(tensor)
+        result = tensor.mean().cpu().item()
+        assert result == 4
+    else:
+        comm.all_reduce(tensor)
+        result = tensor.mean().cpu().item()
+        assert result == 2
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 4,
+                    reason="Need at least 4 GPUs to run the test.")
+def test_pynccl_multiple_tp():
+    # this tests pynccl for multiple tp groups, in a standalone way
+    # i.e. call `comm.all_reduce` directly
+    distributed_run(multiple_tp_worker_fn, 4)
+
+
+@worker_fn_wrapper
+def multiple_tp_with_vllm_worker_fn():
+    device = torch.device(f"cuda:{torch.distributed.get_rank()}")
+    torch.cuda.set_device(torch.distributed.get_rank())
+    ensure_model_parallel_initialized(2, 2)
+    pymccl_utils.init_process_group(
+        group=get_tensor_model_parallel_cpu_group())
+    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
+    with with_pynccl_for_all_reduce():
+        # two tp groups can communicate independently
+        if torch.distributed.get_rank() in [0, 1]:
+            tensor = tensor_model_parallel_all_reduce(tensor)
+            tensor = tensor_model_parallel_all_reduce(tensor)
+            result = tensor.mean().cpu().item()
+            assert result == 4
+        else:
+            tensor = tensor_model_parallel_all_reduce(tensor)
+            result = tensor.mean().cpu().item()
+            assert result == 2
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 4,
+                    reason="Need at least 4 GPUs to run the test.")
+def test_pynccl_multiple_tp_with_vllm():
+    # this tests pynccl for multiple tp groups, together with vllm
+    # i.e. call `tensor_model_parallel_all_reduce`
+    distributed_run(multiple_tp_with_vllm_worker_fn, 4)
+
+
+@worker_fn_wrapper
+def worker_fn_with_cudagraph():
+    with torch.no_grad():
+        graph = torch.cuda.CUDAGraph()
+        comm = NCCLCommunicator()
+        # run something in the default stream to initialize torch engine
+        a = torch.ones((4, 4), device=f'cuda:{comm.rank}')
+        torch.cuda.synchronize()
+        with torch.cuda.graph(graph, stream=comm.stream):
+            # operation during the graph capture is recorded but not executed
+            # see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture # noqa
+            comm.all_reduce(a)
+        comm.stream.synchronize()
+        assert a.mean().cpu().item() == comm.world_size**0
+        graph.replay()
+        comm.stream.synchronize()
+        assert a.mean().cpu().item() == comm.world_size**1
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+def test_pynccl_with_cudagraph():
+    distributed_run(worker_fn_with_cudagraph, 2)
+
+
+def test_ncclGetUniqueId():
+    unique_id = ncclGetUniqueId()
+    # `list(unique_id.internal)` is something like this:
+    # [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0,
+    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+    # as long as the function doesn't raise an exception, we're good
+    assert unique_id is not None
--- a/tests/distributed/test_pynccl_library.py
+++ b/tests/distributed/test_pynccl_library.py
@@ -0,0 +1,43 @@
+import multiprocessing
+import tempfile
+
+
+def target_fn(env, filepath):
+    from vllm.utils import update_environment_variables
+    update_environment_variables(env)
+    from vllm.utils import nccl_integrity_check
+    nccl_integrity_check(filepath)
+
+
+def test_library_file():
+    # note: don't import vllm.distributed.device_communicators.pynccl
+    # before running this test, otherwise the library file will be loaded
+    # and it might interfere with the test
+    from vllm.utils import find_nccl_library
+    so_file = find_nccl_library()
+    with open(so_file, 'rb') as f:
+        content = f.read()
+    try:
+        # corrupt the library file, should raise an exception
+        with open(so_file, 'wb') as f:
+            f.write(content[:len(content) // 2])
+        p = multiprocessing.Process(target=target_fn, args=({}, so_file))
+        p.start()
+        p.join()
+        assert p.exitcode != 0
+
+        # move the library file to a tmp path
+        # test VLLM_NCCL_SO_PATH
+        fd, path = tempfile.mkstemp()
+        with open(path, 'wb') as f:
+            f.write(content)
+        p = multiprocessing.Process(target=target_fn,
+                                    args=({
+                                        "VLLM_NCCL_SO_PATH": path
+                                    }, path))
+        p.start()
+        p.join()
+        assert p.exitcode == 0
+    finally:
+        with open(so_file, 'wb') as f:
+            f.write(content)
--- a/tests/engine/output_processor/test_multi_step.py
+++ b/tests/engine/output_processor/test_multi_step.py
@@ -0,0 +1,270 @@
+import random
+from unittest.mock import MagicMock
+
+import pytest
+from transformers import PreTrainedTokenizer
+
+from tests.core.utils import create_seq_group
+from vllm.core.scheduler import Scheduler
+from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import (Logprob, SequenceGroupOutput, SequenceOutput,
+                           SequenceStatus)
+from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.utils import Counter
+
+
+@pytest.mark.parametrize("seq_output_len", [128])
+@pytest.mark.parametrize("num_new_tokens", [1, 12])
+@pytest.mark.skip_global_cleanup
+def test_appends_token_ids(num_new_tokens: int, seq_output_len: int):
+    """Verify multi-step decoding appends token ids correctly.
+
+    We append token ids and verify all the token ids were appended correctly.
+    Note that ignore_eos=True.
+    """
+    detokenizer = MagicMock(spec=Detokenizer)
+    scheduler = MagicMock(spec=Scheduler)
+    stop_checker = MagicMock(spec=StopChecker)
+    seq_counter = Counter()
+
+    output_processor = MultiStepOutputProcessor(
+        detokenizer=detokenizer,
+        scheduler=scheduler,
+        seq_counter=seq_counter,
+        get_tokenizer_for_seq=lambda _: mock_tokenizer(),
+        stop_checker=stop_checker,
+    )
+
+    seq_group = create_seq_group(
+        seq_prompt_len=1024,
+        seq_output_lens=[seq_output_len],
+        sampling_params=SamplingParams(max_tokens=seq_output_len +
+                                       num_new_tokens,
+                                       ignore_eos=True),
+    )
+
+    seq = seq_group.get_seqs()[0]
+    seq.status = SequenceStatus.RUNNING
+
+    new_token_ids = list(range(num_new_tokens))
+
+    outputs = [
+        SequenceGroupOutput(
+            samples=[
+                SequenceOutput(
+                    parent_seq_id=seq.seq_id,
+                    output_token=output_token,
+                    logprobs={output_token: Logprob(0.0)},
+                )
+            ],
+            prompt_logprobs=None,
+        ) for output_token in new_token_ids
+    ]
+
+    assert seq.get_token_ids()[-len(new_token_ids):] != new_token_ids
+    output_processor.process_outputs(seq_group, outputs)
+    assert seq.get_token_ids()[-len(new_token_ids):] == new_token_ids
+
+
+@pytest.mark.parametrize("seq_prompt_len", [1024])
+@pytest.mark.parametrize("seq_output_len", [128])
+@pytest.mark.parametrize("num_new_tokens", [5, 6, 7, 8])
+@pytest.mark.parametrize("max_tokens", [128 + 3])
+@pytest.mark.skip_global_cleanup
+def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int,
+                             seq_output_len: int, max_tokens: int):
+    """Verify tokens after max_tokens are dropped and not appended to the
+    sequence.
+    """
+    detokenizer = MagicMock(spec=Detokenizer)
+    scheduler = MagicMock(spec=Scheduler)
+    stop_checker = MagicMock(spec=StopChecker)
+    seq_counter = Counter()
+
+    output_processor = MultiStepOutputProcessor(
+        detokenizer=detokenizer,
+        scheduler=scheduler,
+        seq_counter=seq_counter,
+        get_tokenizer_for_seq=lambda _: mock_tokenizer(),
+        stop_checker=stop_checker,
+    )
+
+    seq_group = create_seq_group(
+        seq_prompt_len=seq_prompt_len,
+        seq_output_lens=[seq_output_len],
+        sampling_params=SamplingParams(max_tokens=max_tokens, ),
+    )
+
+    seq = seq_group.get_seqs()[0]
+    seq.status = SequenceStatus.RUNNING
+
+    new_token_ids = list(range(num_new_tokens))
+
+    outputs = [
+        SequenceGroupOutput(
+            samples=[
+                SequenceOutput(
+                    parent_seq_id=seq.seq_id,
+                    output_token=output_token,
+                    logprobs={output_token: Logprob(0.0)},
+                )
+            ],
+            prompt_logprobs=None,
+        ) for output_token in new_token_ids
+    ]
+
+    assert seq.get_len() == seq_prompt_len + seq_output_len
+    output_processor.process_outputs(seq_group, outputs)
+
+    # Expect the processed sequence to not go over max tokens in len.
+    assert seq.get_len() == seq_prompt_len + max_tokens
+
+    # Expect the correct tokens were appended.
+    expected_appended_tokens = new_token_ids[:max_tokens - seq_output_len]
+    assert seq.get_token_ids(
+    )[-len(expected_appended_tokens):] == expected_appended_tokens
+
+
+@pytest.mark.parametrize("seq_prompt_len", [1024])
+@pytest.mark.parametrize("seq_output_len", [128])
+@pytest.mark.parametrize("num_new_tokens", [12])
+@pytest.mark.parametrize("seed", list(range(6)))
+@pytest.mark.skip_global_cleanup
+def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
+                               seq_output_len: int, seed: int):
+    """Verify the eos token id is included in the sequence, but subsequent
+    tokens are dropped (not appended to sequence).
+    """
+    random.seed(seed)
+    detokenizer = MagicMock(spec=Detokenizer)
+    scheduler = MagicMock(spec=Scheduler)
+    stop_checker = MagicMock(spec=StopChecker)
+    seq_counter = Counter()
+
+    eos_token_id = 100
+
+    output_processor = MultiStepOutputProcessor(
+        detokenizer=detokenizer,
+        scheduler=scheduler,
+        seq_counter=seq_counter,
+        get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
+        stop_checker=stop_checker,
+    )
+
+    seq_group = create_seq_group(
+        seq_prompt_len=seq_prompt_len,
+        seq_output_lens=[seq_output_len],
+        sampling_params=SamplingParams(
+            # Ensure enough space.
+            max_tokens=seq_output_len + num_new_tokens, ),
+    )
+
+    seq = seq_group.get_seqs()[0]
+    seq.status = SequenceStatus.RUNNING
+
+    new_token_ids = list(range(num_new_tokens))
+    assert eos_token_id not in new_token_ids
+    eos_index = random.randint(0, len(new_token_ids) - 1)
+    new_token_ids[eos_index] = eos_token_id
+
+    outputs = [
+        SequenceGroupOutput(
+            samples=[
+                SequenceOutput(
+                    parent_seq_id=seq.seq_id,
+                    output_token=output_token,
+                    logprobs={output_token: Logprob(0.0)},
+                )
+            ],
+            prompt_logprobs=None,
+        ) for output_token in new_token_ids
+    ]
+
+    assert seq.get_len() == seq_prompt_len + seq_output_len
+    output_processor.process_outputs(seq_group, outputs)
+
+    # Expect the processed sequence to not go beyond provided eos.
+    assert seq.get_len() == seq_prompt_len + seq_output_len + (eos_index + 1)
+
+    # Expect the correct tokens were appended.
+    expected_appended_tokens = new_token_ids[:eos_index + 1]
+    assert seq.get_token_ids(
+    )[-len(expected_appended_tokens):] == expected_appended_tokens
+
+
+@pytest.mark.parametrize("seq_prompt_len", [1024])
+@pytest.mark.parametrize("seq_output_len", [128])
+@pytest.mark.parametrize("num_new_tokens", [12])
+@pytest.mark.parametrize("seed", list(range(6)))
+@pytest.mark.skip_global_cleanup
+def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
+                              seq_output_len: int, seed: int):
+    """When sampling parameters dictate that we should ignore the eos token id,
+    ensure all token ids are appended even if the eos token id is emitted.
+    """
+    random.seed(seed)
+    detokenizer = MagicMock(spec=Detokenizer)
+    scheduler = MagicMock(spec=Scheduler)
+    stop_checker = MagicMock(spec=StopChecker)
+    seq_counter = Counter()
+
+    eos_token_id = 100
+
+    output_processor = MultiStepOutputProcessor(
+        detokenizer=detokenizer,
+        scheduler=scheduler,
+        seq_counter=seq_counter,
+        get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
+        stop_checker=stop_checker,
+    )
+
+    seq_group = create_seq_group(
+        seq_prompt_len=seq_prompt_len,
+        seq_output_lens=[seq_output_len],
+        sampling_params=SamplingParams(
+            # Ensure enough space.
+            max_tokens=seq_output_len + num_new_tokens,
+            ignore_eos=True,
+        ),
+    )
+
+    seq = seq_group.get_seqs()[0]
+    seq.status = SequenceStatus.RUNNING
+
+    new_token_ids = list(range(num_new_tokens))
+    assert eos_token_id not in new_token_ids
+    eos_index = random.randint(0, len(new_token_ids) - 1)
+    new_token_ids[eos_index] = eos_token_id
+
+    outputs = [
+        SequenceGroupOutput(
+            samples=[
+                SequenceOutput(
+                    parent_seq_id=seq.seq_id,
+                    output_token=output_token,
+                    logprobs={output_token: Logprob(0.0)},
+                )
+            ],
+            prompt_logprobs=None,
+        ) for output_token in new_token_ids
+    ]
+
+    assert seq.get_len() == seq_prompt_len + seq_output_len
+    output_processor.process_outputs(seq_group, outputs)
+
+    # Expect the processed sequence to go beyond eos.
+    assert seq.get_len() == seq_prompt_len + seq_output_len + num_new_tokens
+
+    # Expect the correct tokens were appended.
+    expected_appended_tokens = new_token_ids[:seq_output_len + num_new_tokens -
+                                             seq_output_len]
+    assert seq.get_token_ids(
+    )[-len(expected_appended_tokens):] == expected_appended_tokens
+
+
+def mock_tokenizer(eos_token_id=1000):
+    tokenizer = MagicMock(spec=PreTrainedTokenizer)
+    tokenizer.eos_token_id = eos_token_id
+    return tokenizer
--- a/tests/engine/test_computed_prefix_blocks.py
+++ b/tests/engine/test_computed_prefix_blocks.py
@@ -0,0 +1,34 @@
+import pytest
+
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.llm_engine import LLMEngine
+from vllm.sampling_params import SamplingParams
+
+
+@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("block_size", [16])
+def test_computed_prefix_blocks(model: str, block_size: int):
+    # This test checks if we are able to run the engine to completion
+    # without triggering asserts.
+    # We are in a scenario where all blocks from the second request's prompt
+    # are full and already computed when the second request arrives.
+    prompt = (
+        "You are a helpful assistant. How do I build a car from cardboard and "
+        "paper clips? Is there an easy to follow video tutorial available "
+        "online for free?")
+    prompt2 = (
+        " Please recommend to me some resources where I can learn not only to "
+        "handle technical difficulties of building a car, but also "
+        "decoration.")
+
+    engine_args = EngineArgs(model=model,
+                             block_size=block_size,
+                             enable_prefix_caching=True)
+
+    engine = LLMEngine.from_engine_args(engine_args)
+    sampling_params = SamplingParams()
+
+    engine.add_request("0", prompt + prompt2, sampling_params)
+    engine.step()
+    engine.add_request("1", prompt, sampling_params)
+    engine.step()
--- a/tests/engine/test_detokenization.py
+++ b/tests/engine/test_detokenization.py
@@ -0,0 +1,32 @@
+import pytest
+
+from vllm.entrypoints.llm import LLM
+from vllm.sampling_params import SamplingParams
+
+
+@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+def test_computed_prefix_blocks(model: str):
+    # This test checks if the engine generates completions both with and
+    # without optional detokenization, that detokenization includes text
+    # and no-detokenization doesn't, and that both completions have the same
+    # token_ids.
+    prompt = (
+        "You are a helpful assistant. How do I build a car from cardboard and "
+        "paper clips? Is there an easy to follow video tutorial available "
+        "online for free?")
+
+    llm = LLM(model=model)
+    sampling_params = SamplingParams(max_tokens=10,
+                                     temperature=0.0,
+                                     detokenize=False)
+
+    outputs_no_detokenization = llm.generate(prompt,
+                                             sampling_params)[0].outputs[0]
+    sampling_params.detokenize = True
+    outputs_with_detokenization = llm.generate(prompt,
+                                               sampling_params)[0].outputs[0]
+
+    assert outputs_no_detokenization.text == ''
+    assert outputs_with_detokenization.text != ''
+    assert outputs_no_detokenization.token_ids == \
+        outputs_with_detokenization.token_ids
--- a/tests/engine/test_multiproc_workers.py
+++ b/tests/engine/test_multiproc_workers.py
@@ -0,0 +1,176 @@
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from time import sleep
+from typing import Any, List, Tuple
+
+import pytest
+
+from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
+                                                  ResultHandler, WorkerMonitor)
+
+
+class DummyWorker:
+    """Dummy version of vllm.worker.worker.Worker"""
+
+    def __init__(self, rank: int):
+        self.rank = rank
+
+    def worker_method(self, worker_input: Any) -> Tuple[int, Any]:
+        sleep(0.05)
+
+        if isinstance(worker_input, Exception):
+            # simulate error case
+            raise worker_input
+
+        return self.rank, input
+
+
+def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]:
+    result_handler = ResultHandler()
+    workers = [
+        ProcessWorkerWrapper(result_handler, partial(DummyWorker, rank=rank))
+        for rank in range(8)
+    ]
+
+    worker_monitor = WorkerMonitor(workers, result_handler)
+    assert not worker_monitor.is_alive()
+
+    result_handler.start()
+    worker_monitor.start()
+    assert worker_monitor.is_alive()
+
+    return workers, worker_monitor
+
+
+def test_local_workers() -> None:
+    """Test workers with sync task submission"""
+
+    workers, worker_monitor = _start_workers()
+
+    def execute_workers(worker_input: str) -> None:
+        worker_outputs = [
+            worker.execute_method("worker_method", worker_input)
+            for worker in workers
+        ]
+
+        for rank, output in enumerate(worker_outputs):
+            assert output.get() == (rank, input)
+
+    executor = ThreadPoolExecutor(max_workers=4)
+
+    # Test concurrent submission from different threads
+    futures = [
+        executor.submit(partial(execute_workers, f"thread {thread_num}"))
+        for thread_num in range(4)
+    ]
+
+    for future in futures:
+        future.result()
+
+    # Test error case
+    exception = ValueError("fake error")
+    result = workers[0].execute_method("worker_method", exception)
+    try:
+        result.get()
+        pytest.fail("task should have failed")
+    except Exception as e:
+        assert isinstance(e, ValueError)
+        assert str(e) == "fake error"
+
+    # Test cleanup when a worker fails
+    assert worker_monitor.is_alive()
+    workers[3].process.kill()
+
+    # Other workers should get shut down here
+    worker_monitor.join(2)
+
+    # Ensure everything is stopped
+    assert not worker_monitor.is_alive()
+    assert all(not worker.process.is_alive() for worker in workers)
+
+    # Further attempts to submit tasks should fail
+    try:
+        _result = workers[0].execute_method("worker_method", "test")
+        pytest.fail("task should fail once workers have been shut down")
+    except Exception as e:
+        assert isinstance(e, ChildProcessError)
+
+
+def test_local_workers_clean_shutdown() -> None:
+    """Test clean shutdown"""
+
+    workers, worker_monitor = _start_workers()
+
+    assert worker_monitor.is_alive()
+    assert all(worker.process.is_alive() for worker in workers)
+
+    # Clean shutdown
+    worker_monitor.close()
+
+    worker_monitor.join(5)
+
+    # Ensure everything is stopped
+    assert not worker_monitor.is_alive()
+    assert all(not worker.process.is_alive() for worker in workers)
+
+    # Further attempts to submit tasks should fail
+    try:
+        _result = workers[0].execute_method("worker_method", "test")
+        pytest.fail("task should fail once workers have been shut down")
+    except Exception as e:
+        assert isinstance(e, ChildProcessError)
+
+
+@pytest.mark.asyncio
+async def test_local_workers_async() -> None:
+    """Test local workers with async task submission"""
+
+    workers, worker_monitor = _start_workers()
+
+    async def execute_workers(worker_input: str) -> None:
+        worker_coros = [
+            worker.execute_method_async("worker_method", worker_input)
+            for worker in workers
+        ]
+
+        results = await asyncio.gather(*worker_coros)
+        for rank, result in enumerate(results):
+            assert result == (rank, input)
+
+    tasks = [
+        asyncio.create_task(execute_workers(f"task {task_num}"))
+        for task_num in range(4)
+    ]
+
+    for task in tasks:
+        await task
+
+    # Test error case
+    exception = ValueError("fake error")
+    try:
+        _result = await workers[0].execute_method_async(
+            "worker_method", exception)
+        pytest.fail("task should have failed")
+    except Exception as e:
+        assert isinstance(e, ValueError)
+        assert str(e) == "fake error"
+
+    # Test cleanup when a worker fails
+    assert worker_monitor.is_alive()
+    workers[3].process.kill()
+
+    # Other workers should get shut down here
+    worker_monitor.join(2)
+
+    # Ensure everything is stopped
+    assert not worker_monitor.is_alive()
+    assert all(not worker.process.is_alive() for worker in workers)
+
+    # Further attempts to submit tasks should fail
+    try:
+        _result = await workers[0].execute_method_async(
+            "worker_method", "test")
+        pytest.fail("task should fail once workers have been shut down")
+    except Exception as e:
+        assert isinstance(e, ChildProcessError)
--- a/tests/engine/test_skip_tokenizer_init.py
+++ b/tests/engine/test_skip_tokenizer_init.py
@@ -0,0 +1,23 @@
+import pytest
+
+from vllm.entrypoints.llm import LLM
+from vllm.sampling_params import SamplingParams
+
+
+@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+def test_skip_tokenizer_initialization(model: str):
+    # This test checks if the flag skip_tokenizer_init skips the initialization
+    # of tokenizer and detokenizer. The generated output is expected to contain
+    # token ids.
+    llm = LLM(model=model, skip_tokenizer_init=True)
+    sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
+    with pytest.raises(ValueError) as err:
+        llm.generate("abc", sampling_params)
+    assert "prompts must be None if" in str(err.value)
+    outputs = llm.generate(prompt_token_ids=[[1, 2, 3]],
+                           sampling_params=sampling_params)
+    assert len(outputs) > 0
+    completions = outputs[0].outputs
+    assert len(completions) > 0
+    assert completions[0].text == ""
+    assert completions[0].token_ids
--- a/tests/engine/test_stop_reason.py
+++ b/tests/engine/test_stop_reason.py
@@ -0,0 +1,59 @@
+"""Test the different finish_reason="stop" situations during generation:
+    1. One of the provided stop strings
+    2. One of the provided stop tokens
+    3. The EOS token
+
+Run `pytest tests/engine/test_stop_reason.py`.
+"""
+
+import pytest
+import transformers
+
+from vllm import SamplingParams
+
+MODEL = "facebook/opt-350m"
+STOP_STR = "."
+SEED = 42
+MAX_TOKENS = 1024
+
+
+@pytest.fixture
+def vllm_model(vllm_runner):
+    vllm_model = vllm_runner(MODEL)
+    yield vllm_model
+    del vllm_model
+
+
+def test_stop_reason(vllm_model, example_prompts):
+    tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)
+    stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR)
+    llm = vllm_model.model
+
+    # test stop token
+    outputs = llm.generate(example_prompts,
+                           sampling_params=SamplingParams(
+                               seed=SEED,
+                               max_tokens=MAX_TOKENS,
+                               stop_token_ids=[stop_token_id]))
+    for output in outputs:
+        output = output.outputs[0]
+        assert output.finish_reason == "stop"
+        assert output.stop_reason == stop_token_id
+
+    # test stop string
+    outputs = llm.generate(example_prompts,
+                           sampling_params=SamplingParams(
+                               seed=SEED, max_tokens=MAX_TOKENS, stop="."))
+    for output in outputs:
+        output = output.outputs[0]
+        assert output.finish_reason == "stop"
+        assert output.stop_reason == STOP_STR
+
+    # test EOS token
+    outputs = llm.generate(example_prompts,
+                           sampling_params=SamplingParams(
+                               seed=SEED, max_tokens=MAX_TOKENS))
+    for output in outputs:
+        output = output.outputs[0]
+        assert output.finish_reason == "length" or (
+            output.finish_reason == "stop" and output.stop_reason is None)
--- a/tests/engine/test_stop_strings.py
+++ b/tests/engine/test_stop_strings.py
@@ -0,0 +1,111 @@
+from typing import Any, List, Optional
+
+import pytest
+
+from vllm import CompletionOutput, LLMEngine, SamplingParams
+
+MODEL = "meta-llama/llama-2-7b-hf"
+MAX_TOKENS = 200
+
+
+@pytest.fixture(scope="session")
+def vllm_model(vllm_runner):
+    return vllm_runner(MODEL)
+
+
+@pytest.mark.skip_global_cleanup
+def test_stop_basic(vllm_model):
+    _test_stopping(vllm_model.model.llm_engine,
+                   stop=["."],
+                   include_in_output=False,
+                   expected_output="VLLM is a 100% volunteer organization",
+                   expected_reason=".")
+
+    _test_stopping(vllm_model.model.llm_engine,
+                   stop=["."],
+                   include_in_output=True,
+                   expected_output="VLLM is a 100% volunteer organization.",
+                   expected_reason=".")
+
+
+@pytest.mark.skip_global_cleanup
+def test_stop_multi_tokens(vllm_model):
+    _test_stopping(
+        vllm_model.model.llm_engine,
+        stop=["group of peo", "short"],
+        include_in_output=False,
+        expected_output="VLLM is a 100% volunteer organization. We are a ",
+        expected_reason="group of peo")
+
+    _test_stopping(
+        vllm_model.model.llm_engine,
+        stop=["group of peo", "short"],
+        include_in_output=True,
+        expected_output=
+        "VLLM is a 100% volunteer organization. We are a group of peo",
+        expected_reason="group of peo")
+
+
+@pytest.mark.skip_global_cleanup
+def test_stop_partial_token(vllm_model):
+    _test_stopping(vllm_model.model.llm_engine,
+                   stop=["gani"],
+                   include_in_output=False,
+                   expected_output="VLLM is a 100% volunteer or",
+                   expected_reason="gani")
+
+    _test_stopping(vllm_model.model.llm_engine,
+                   stop=["gani"],
+                   include_in_output=True,
+                   expected_output="VLLM is a 100% volunteer organi",
+                   expected_reason="gani")
+
+
+@pytest.mark.skip_global_cleanup
+def test_stop_token_id(vllm_model):
+    # token id 13013 => " organization"
+
+    _test_stopping(vllm_model.model.llm_engine,
+                   stop_token_ids=[13013],
+                   include_in_output=False,
+                   expected_output="VLLM is a 100% volunteer",
+                   expected_reason=13013)
+
+    _test_stopping(vllm_model.model.llm_engine,
+                   stop_token_ids=[13013],
+                   include_in_output=True,
+                   expected_output="VLLM is a 100% volunteer organization",
+                   expected_reason=13013)
+
+
+def _test_stopping(llm_engine: LLMEngine,
+                   expected_output: str,
+                   expected_reason: Any,
+                   stop: Optional[List[str]] = None,
+                   stop_token_ids: Optional[List[int]] = None,
+                   include_in_output: bool = False) -> None:
+    llm_engine.add_request(
+        "id", "A story about vLLM:\n",
+        SamplingParams(
+            temperature=0.0,
+            max_tokens=MAX_TOKENS,
+            stop=stop,
+            stop_token_ids=stop_token_ids,
+            include_stop_str_in_output=include_in_output,
+        ), None)
+
+    output: Optional[CompletionOutput] = None
+    output_text = ""
+    stop_reason = None
+    while llm_engine.has_unfinished_requests():
+        (request_output, ) = llm_engine.step()
+        (output, ) = request_output.outputs
+
+        # Ensure we don't backtrack
+        assert output.text.startswith(output_text)
+        output_text = output.text
+        stop_reason = output.stop_reason
+
+    assert output is not None
+    assert output_text == expected_output
+    assert stop_reason == expected_reason
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -0,0 +1,37 @@
+import asyncio
+from dataclasses import dataclass
+
+from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+
+MODEL_NAME = "openai-community/gpt2"
+CHAT_TEMPLATE = "Dummy chat template for testing {}"
+
+
+@dataclass
+class MockModelConfig:
+    tokenizer = MODEL_NAME
+    trust_remote_code = False
+    tokenizer_mode = "auto"
+    max_model_len = 100
+    tokenizer_revision = None
+
+
+@dataclass
+class MockEngine:
+
+    async def get_model_config(self):
+        return MockModelConfig
+
+
+async def _async_serving_chat_init():
+    serving_completion = OpenAIServingChat(MockEngine(),
+                                           served_model_names=[MODEL_NAME],
+                                           response_role="assistant",
+                                           chat_template=CHAT_TEMPLATE)
+    return serving_completion
+
+
+def test_async_serving_chat_init():
+    serving_completion = asyncio.run(_async_serving_chat_init())
+    assert serving_completion.tokenizer is not None
+    assert serving_completion.tokenizer.chat_template == CHAT_TEMPLATE
--- a/tests/entrypoints/test_guided_processors.py
+++ b/tests/entrypoints/test_guided_processors.py
@@ -0,0 +1,113 @@
+# This unit test should be moved to a new
+# tests/test_guided_decoding directory.
+import pytest
+import torch
+from transformers import AutoTokenizer
+
+from vllm.entrypoints.openai.protocol import CompletionRequest
+from vllm.model_executor.guided_decoding import (
+    get_guided_decoding_logits_processor)
+from vllm.model_executor.guided_decoding.outlines_logits_processors import (
+    JSONLogitsProcessor, RegexLogitsProcessor)
+
+TEST_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "name": {
+            "type": "string"
+        },
+        "age": {
+            "type": "integer"
+        },
+        "skills": {
+            "type": "array",
+            "items": {
+                "type": "string",
+                "maxLength": 10
+            },
+            "minItems": 3
+        },
+        "work history": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "company": {
+                        "type": "string"
+                    },
+                    "duration": {
+                        "type": "string"
+                    },
+                    "position": {
+                        "type": "string"
+                    }
+                },
+                "required": ["company", "position"]
+            }
+        }
+    },
+    "required": ["name", "age", "skills", "work history"]
+}
+
+TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
+              r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
+
+
+def test_guided_logits_processors():
+    """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
+    tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
+    regex_LP = RegexLogitsProcessor(TEST_REGEX, tokenizer)
+    json_LP = JSONLogitsProcessor(TEST_SCHEMA,
+                                  tokenizer,
+                                  whitespace_pattern=None)
+
+    regex_LP.init_state()
+    token_ids = tokenizer.encode(
+        f"Give an example IPv4 address with this regex: {TEST_REGEX}")
+    tensor = torch.rand(32000)
+    original_tensor = torch.clone(tensor)
+    regex_LP(token_ids, tensor)
+    assert tensor.shape == original_tensor.shape
+    assert not torch.allclose(tensor, original_tensor)
+
+    json_LP.init_state()
+    token_ids = tokenizer.encode(
+        f"Give an employee profile that fits this schema: {TEST_SCHEMA}")
+    tensor = torch.rand(32000)
+    original_tensor = torch.clone(tensor)
+    json_LP(token_ids, tensor)
+    assert tensor.shape == original_tensor.shape
+    assert not torch.allclose(tensor, original_tensor)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("backend", ["outlines", "lm-format-enforcer"])
+async def test_guided_logits_processor_black_box(backend: str):
+    tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
+    token_ids = tokenizer.encode(
+        f"Give an example IPv4 address with this regex: {TEST_REGEX}")
+    regex_request = CompletionRequest(model='test',
+                                      prompt=token_ids,
+                                      guided_regex=TEST_REGEX)
+    regex_lp = await get_guided_decoding_logits_processor(
+        backend, regex_request, tokenizer)
+    assert regex_lp is not None
+    tensor = torch.rand(32000)
+    original_tensor = torch.clone(tensor)
+    tensor = regex_lp(token_ids, tensor)
+    assert tensor.shape == original_tensor.shape
+    assert not torch.allclose(tensor, original_tensor)
+
+    token_ids = tokenizer.encode(
+        f"Give an employee profile that fits this schema: {TEST_SCHEMA}")
+    json_request = CompletionRequest(model='test',
+                                     prompt=token_ids,
+                                     guided_json=TEST_SCHEMA)
+    json_lp = await get_guided_decoding_logits_processor(
+        backend, json_request, tokenizer)
+    assert json_lp is not None
+    tensor = torch.rand(32000)
+    original_tensor = torch.clone(tensor)
+    tensor = json_lp(token_ids, tensor)
+    assert tensor.shape == original_tensor.shape
+    assert not torch.allclose(tensor, original_tensor)
--- a/tests/entrypoints/test_llm_generate.py
+++ b/tests/entrypoints/test_llm_generate.py
@@ -0,0 +1,41 @@
+import pytest
+
+from vllm import LLM, SamplingParams
+
+
+def test_multiple_sampling_params():
+
+    llm = LLM(model="facebook/opt-125m",
+              max_num_batched_tokens=4096,
+              tensor_parallel_size=1)
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    sampling_params = [
+        SamplingParams(temperature=0.01, top_p=0.95),
+        SamplingParams(temperature=0.3, top_p=0.95),
+        SamplingParams(temperature=0.7, top_p=0.95),
+        SamplingParams(temperature=0.99, top_p=0.95),
+    ]
+
+    # Multiple SamplingParams should be matched with each prompt
+    outputs = llm.generate(prompts, sampling_params=sampling_params)
+    assert len(prompts) == len(outputs)
+
+    # Exception raised, if the size of params does not match the size of prompts
+    with pytest.raises(ValueError):
+        outputs = llm.generate(prompts, sampling_params=sampling_params[:3])
+
+    # Single SamplingParams should be applied to every prompt
+    single_sampling_params = SamplingParams(temperature=0.3, top_p=0.95)
+    outputs = llm.generate(prompts, sampling_params=single_sampling_params)
+    assert len(prompts) == len(outputs)
+
+    # sampling_params is None, default params should be applied
+    outputs = llm.generate(prompts, sampling_params=None)
+    assert len(prompts) == len(outputs)
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -0,0 +1,894 @@
+# imports for guided decoding tests
+import json
+import os
+import re
+import subprocess
+import sys
+import time
+
+import jsonschema
+import openai  # use the official client for correctness check
+import pytest
+# using Ray for overall ease of process management, parallel requests,
+# and debugging.
+import ray
+import requests
+import torch
+# downloading lora to test lora requests
+from huggingface_hub import snapshot_download
+from openai import BadRequestError
+
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+MAX_SERVER_START_WAIT_S = 600  # wait for server to start for 60 seconds
+# any model with a chat template should work here
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+# technically this needs Mistral-7B-v0.1 as base, but we're not testing
+# generation quality here
+LORA_NAME = "typeof/zephyr-7b-beta-lora"
+
+TEST_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "name": {
+            "type": "string"
+        },
+        "age": {
+            "type": "integer"
+        },
+        "skills": {
+            "type": "array",
+            "items": {
+                "type": "string",
+                "maxLength": 10
+            },
+            "minItems": 3
+        },
+        "work history": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "company": {
+                        "type": "string"
+                    },
+                    "duration": {
+                        "type": "string"
+                    },
+                    "position": {
+                        "type": "string"
+                    }
+                },
+                "required": ["company", "position"]
+            }
+        }
+    },
+    "required": ["name", "age", "skills", "work history"]
+}
+
+TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
+              r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
+
+TEST_CHOICE = [
+    "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby",
+    "Swift", "Kotlin"
+]
+
+pytestmark = pytest.mark.asyncio
+
+
+@ray.remote(num_gpus=1)
+class ServerRunner:
+
+    def __init__(self, args):
+        env = os.environ.copy()
+        env["PYTHONUNBUFFERED"] = "1"
+        self.proc = subprocess.Popen(
+            ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args,
+            env=env,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+        )
+        self._wait_for_server()
+
+    def ready(self):
+        return True
+
+    def _wait_for_server(self):
+        # run health check
+        start = time.time()
+        while True:
+            try:
+                if requests.get(
+                        "http://localhost:8000/health").status_code == 200:
+                    break
+            except Exception as err:
+                if self.proc.poll() is not None:
+                    raise RuntimeError("Server exited unexpectedly.") from err
+
+                time.sleep(0.5)
+                if time.time() - start > MAX_SERVER_START_WAIT_S:
+                    raise RuntimeError(
+                        "Server failed to start in time.") from err
+
+    def __del__(self):
+        if hasattr(self, "proc"):
+            self.proc.terminate()
+
+
+@pytest.fixture(scope="session")
+def zephyr_lora_files():
+    return snapshot_download(repo_id=LORA_NAME)
+
+
+@pytest.fixture(scope="session")
+def server(zephyr_lora_files):
+    ray.init()
+    server_runner = ServerRunner.remote([
+        "--model",
+        MODEL_NAME,
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        # lora config below
+        "--enable-lora",
+        "--lora-modules",
+        f"zephyr-lora={zephyr_lora_files}",
+        f"zephyr-lora2={zephyr_lora_files}",
+        "--max-lora-rank",
+        "64",
+        "--max-cpu-loras",
+        "2",
+        "--max-num-seqs",
+        "128",
+    ])
+    ray.get(server_runner.ready.remote())
+    yield server_runner
+    ray.shutdown()
+
+
+@pytest.fixture(scope="module")
+def client():
+    client = openai.AsyncOpenAI(
+        base_url="http://localhost:8000/v1",
+        api_key="token-abc123",
+    )
+    yield client
+
+
+async def test_check_models(server, client: openai.AsyncOpenAI):
+    models = await client.models.list()
+    models = models.data
+    served_model = models[0]
+    lora_models = models[1:]
+    assert served_model.id == MODEL_NAME
+    assert all(model.root == MODEL_NAME for model in models)
+    assert lora_models[0].id == "zephyr-lora"
+    assert lora_models[1].id == "zephyr-lora2"
+
+
+@pytest.mark.parametrize(
+    # first test base model, then test loras
+    "model_name",
+    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+)
+async def test_single_completion(server, client: openai.AsyncOpenAI,
+                                 model_name: str):
+    completion = await client.completions.create(model=model_name,
+                                                 prompt="Hello, my name is",
+                                                 max_tokens=5,
+                                                 temperature=0.0)
+
+    assert completion.id is not None
+    assert completion.choices is not None and len(completion.choices) == 1
+    assert completion.choices[0].text is not None and len(
+        completion.choices[0].text) >= 5
+    assert completion.choices[0].finish_reason == "length"
+    assert completion.usage == openai.types.CompletionUsage(
+        completion_tokens=5, prompt_tokens=6, total_tokens=11)
+
+    # test using token IDs
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    assert completion.choices[0].text is not None and len(
+        completion.choices[0].text) >= 5
+
+
+@pytest.mark.parametrize(
+    # first test base model, then test loras
+    "model_name",
+    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+)
+async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
+                             model_name: str):
+    # test using token IDs
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+        logprobs=0,
+    )
+    choice = completion.choices[0]
+    assert choice.logprobs is not None
+    assert choice.logprobs.token_logprobs is not None
+    assert choice.logprobs.top_logprobs is None
+
+
+@pytest.mark.parametrize(
+    # just test 1 lora hereafter
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_single_chat_session(server, client: openai.AsyncOpenAI,
+                                   model_name: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role": "user",
+        "content": "what is 1+1?"
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(model=model_name,
+                                                           messages=messages,
+                                                           max_tokens=10,
+                                                           logprobs=True,
+                                                           top_logprobs=5)
+    assert chat_completion.id is not None
+    assert chat_completion.choices is not None and len(
+        chat_completion.choices) == 1
+    assert chat_completion.choices[0].message is not None
+    assert chat_completion.choices[0].logprobs is not None
+    assert chat_completion.choices[0].logprobs.top_logprobs is not None
+    assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 5
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
+
+
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_too_many_logprobs(server, client: openai.AsyncOpenAI,
+                                 model_name: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role": "user",
+        "content": "what is 1+1?"
+    }]
+
+    # Default max_logprobs is 5, so this should raise an error
+    with pytest.raises((openai.BadRequestError, openai.APIError)):
+        stream = await client.chat.completions.create(model=model_name,
+                                                      messages=messages,
+                                                      max_tokens=10,
+                                                      logprobs=True,
+                                                      top_logprobs=10,
+                                                      stream=True)
+        async for chunk in stream:
+            ...
+
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(model=model_name,
+                                             messages=messages,
+                                             max_tokens=10,
+                                             logprobs=True,
+                                             top_logprobs=10,
+                                             stream=False)
+
+    with pytest.raises((openai.BadRequestError, openai.APIError)):
+        stream = await client.completions.create(model=model_name,
+                                                 prompt="Test",
+                                                 max_tokens=10,
+                                                 logprobs=10,
+                                                 stream=True)
+        async for chunk in stream:
+            ...
+
+    with pytest.raises(openai.BadRequestError):
+        await client.completions.create(model=model_name,
+                                        prompt="Test",
+                                        max_tokens=10,
+                                        logprobs=10,
+                                        stream=False)
+
+    # the server should still work afterwards
+    chat_completion = await client.chat.completions.create(model=model_name,
+                                                           messages=messages,
+                                                           max_tokens=10,
+                                                           stream=False)
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
+
+
+@pytest.mark.parametrize(
+    # just test 1 lora hereafter
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_completion_streaming(server, client: openai.AsyncOpenAI,
+                                    model_name: str):
+    prompt = "What is an LLM?"
+
+    single_completion = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+    )
+    single_output = single_completion.choices[0].text
+    single_usage = single_completion.usage
+
+    stream = await client.completions.create(model=model_name,
+                                             prompt=prompt,
+                                             max_tokens=5,
+                                             temperature=0.0,
+                                             stream=True)
+    chunks = []
+    finish_reason_count = 0
+    async for chunk in stream:
+        chunks.append(chunk.choices[0].text)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
+    assert chunk.choices[0].finish_reason == "length"
+    assert chunk.choices[0].text
+    assert chunk.usage == single_usage
+    assert "".join(chunks) == single_output
+
+
+@pytest.mark.parametrize(
+    # just test 1 lora hereafter
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_chat_streaming(server, client: openai.AsyncOpenAI,
+                              model_name: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role": "user",
+        "content": "what is 1+1?"
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+    )
+    output = chat_completion.choices[0].message.content
+    stop_reason = chat_completion.choices[0].finish_reason
+
+    # test streaming
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+        stream=True,
+    )
+    chunks = []
+    finish_reason_count = 0
+    async for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.role:
+            assert delta.role == "assistant"
+        if delta.content:
+            chunks.append(delta.content)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
+    assert chunk.choices[0].finish_reason == stop_reason
+    assert delta.content
+    assert "".join(chunks) == output
+
+
+@pytest.mark.parametrize(
+    # just test 1 lora hereafter
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_batch_completions(server, client: openai.AsyncOpenAI,
+                                 model_name: str):
+    # test simple list
+    batch = await client.completions.create(
+        model=model_name,
+        prompt=["Hello, my name is", "Hello, my name is"],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    assert len(batch.choices) == 2
+    assert batch.choices[0].text == batch.choices[1].text
+
+    # test n = 2
+    batch = await client.completions.create(
+        model=model_name,
+        prompt=["Hello, my name is", "Hello, my name is"],
+        n=2,
+        max_tokens=5,
+        temperature=0.0,
+        extra_body=dict(
+            # NOTE: this has to be true for n > 1 in vLLM, but not necessary
+            # for official client.
+            use_beam_search=True),
+    )
+    assert len(batch.choices) == 4
+    assert batch.choices[0].text != batch.choices[
+        1].text, "beam search should be different"
+    assert batch.choices[0].text == batch.choices[
+        2].text, "two copies of the same prompt should be the same"
+    assert batch.choices[1].text == batch.choices[
+        3].text, "two copies of the same prompt should be the same"
+
+    # test streaming
+    batch = await client.completions.create(
+        model=model_name,
+        prompt=["Hello, my name is", "Hello, my name is"],
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+    )
+    texts = [""] * 2
+    async for chunk in batch:
+        assert len(chunk.choices) == 1
+        choice = chunk.choices[0]
+        texts[choice.index] += choice.text
+    assert texts[0] == texts[1]
+
+
+async def test_logits_bias(server, client: openai.AsyncOpenAI):
+    prompt = "Hello, my name is"
+    max_tokens = 5
+    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+
+    # Test exclusive selection
+    token_id = 1000
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=prompt,
+        max_tokens=max_tokens,
+        temperature=0.0,
+        logit_bias={str(token_id): 100},
+        seed=42,
+    )
+    assert completion.choices[0].text is not None and len(
+        completion.choices[0].text) >= 5
+    response_tokens = tokenizer(completion.choices[0].text,
+                                add_special_tokens=False)["input_ids"]
+    expected_tokens = tokenizer(tokenizer.decode([token_id] * 5),
+                                add_special_tokens=False)["input_ids"]
+    assert all([
+        response == expected
+        for response, expected in zip(response_tokens, expected_tokens)
+    ])
+
+    # Test ban
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=prompt,
+        max_tokens=max_tokens,
+        temperature=0.0,
+    )
+    response_tokens = tokenizer(completion.choices[0].text,
+                                add_special_tokens=False)["input_ids"]
+    first_response = completion.choices[0].text
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=prompt,
+        max_tokens=max_tokens,
+        temperature=0.0,
+        logit_bias={str(token): -100
+                    for token in response_tokens},
+    )
+    assert first_response != completion.choices[0].text
+
+
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
+                                      guided_decoding_backend: str):
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=f"Give an example JSON for an employee profile "
+        f"that fits this schema: {TEST_SCHEMA}",
+        n=3,
+        temperature=1.0,
+        max_tokens=500,
+        extra_body=dict(guided_json=TEST_SCHEMA,
+                        guided_decoding_backend=guided_decoding_backend))
+
+    assert completion.id is not None
+    assert completion.choices is not None and len(completion.choices) == 3
+    for i in range(3):
+        assert completion.choices[i].text is not None
+        output_json = json.loads(completion.choices[i].text)
+        jsonschema.validate(instance=output_json, schema=TEST_SCHEMA)
+
+
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
+                                guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        f"Give an example JSON for an employee profile that "
+        f"fits this schema: {TEST_SCHEMA}"
+    }]
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=1000,
+        extra_body=dict(guided_json=TEST_SCHEMA,
+                        guided_decoding_backend=guided_decoding_backend))
+    message = chat_completion.choices[0].message
+    assert message.content is not None
+    json1 = json.loads(message.content)
+    jsonschema.validate(instance=json1, schema=TEST_SCHEMA)
+
+    messages.append({"role": "assistant", "content": message.content})
+    messages.append({
+        "role":
+        "user",
+        "content":
+        "Give me another one with a different name and age"
+    })
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=1000,
+        extra_body=dict(guided_json=TEST_SCHEMA,
+                        guided_decoding_backend=guided_decoding_backend))
+    message = chat_completion.choices[0].message
+    assert message.content is not None
+    json2 = json.loads(message.content)
+    jsonschema.validate(instance=json2, schema=TEST_SCHEMA)
+    assert json1["name"] != json2["name"]
+    assert json1["age"] != json2["age"]
+
+
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
+                                       guided_decoding_backend: str):
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=f"Give an example IPv4 address with this regex: {TEST_REGEX}",
+        n=3,
+        temperature=1.0,
+        max_tokens=20,
+        extra_body=dict(guided_regex=TEST_REGEX,
+                        guided_decoding_backend=guided_decoding_backend))
+
+    assert completion.id is not None
+    assert completion.choices is not None and len(completion.choices) == 3
+    for i in range(3):
+        assert completion.choices[i].text is not None
+        assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None
+
+
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
+                                 guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        f"Give an example IP address with this regex: {TEST_REGEX}"
+    }]
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=20,
+        extra_body=dict(guided_regex=TEST_REGEX,
+                        guided_decoding_backend=guided_decoding_backend))
+    ip1 = chat_completion.choices[0].message.content
+    assert ip1 is not None
+    assert re.fullmatch(TEST_REGEX, ip1) is not None
+
+    messages.append({"role": "assistant", "content": ip1})
+    messages.append({"role": "user", "content": "Give me a different one"})
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=20,
+        extra_body=dict(guided_regex=TEST_REGEX,
+                        guided_decoding_backend=guided_decoding_backend))
+    ip2 = chat_completion.choices[0].message.content
+    assert ip2 is not None
+    assert re.fullmatch(TEST_REGEX, ip2) is not None
+    assert ip1 != ip2
+
+
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
+                                        guided_decoding_backend: str):
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt="The best language for type-safe systems programming is ",
+        n=2,
+        temperature=1.0,
+        max_tokens=10,
+        extra_body=dict(guided_choice=TEST_CHOICE,
+                        guided_decoding_backend=guided_decoding_backend))
+
+    assert completion.id is not None
+    assert completion.choices is not None and len(completion.choices) == 2
+    for i in range(2):
+        assert completion.choices[i].text in TEST_CHOICE
+
+
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
+                                  guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        "The best language for type-safe systems programming is "
+    }]
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=10,
+        extra_body=dict(guided_choice=TEST_CHOICE,
+                        guided_decoding_backend=guided_decoding_backend))
+    choice1 = chat_completion.choices[0].message.content
+    assert choice1 in TEST_CHOICE
+
+    messages.append({"role": "assistant", "content": choice1})
+    messages.append({
+        "role": "user",
+        "content": "I disagree, pick another one"
+    })
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=10,
+        extra_body=dict(guided_choice=TEST_CHOICE,
+                        guided_decoding_backend=guided_decoding_backend))
+    choice2 = chat_completion.choices[0].message.content
+    assert choice2 in TEST_CHOICE
+    assert choice1 != choice2
+
+
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
+                                          guided_decoding_backend: str):
+    with pytest.raises(openai.BadRequestError):
+        _ = await client.completions.create(
+            model=MODEL_NAME,
+            prompt="Give an example JSON that fits this schema: 42",
+            extra_body=dict(guided_json=42,
+                            guided_decoding_backend=guided_decoding_backend))
+
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        "The best language for type-safe systems programming is "
+    }]
+    with pytest.raises(openai.BadRequestError):
+        _ = await client.chat.completions.create(model=MODEL_NAME,
+                                                 messages=messages,
+                                                 extra_body=dict(guided_regex={
+                                                     1: "Python",
+                                                     2: "C++"
+                                                 }))
+
+    with pytest.raises(openai.BadRequestError):
+        _ = await client.completions.create(
+            model=MODEL_NAME,
+            prompt="Give an example string that fits this regex",
+            extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA))
+
+
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
+                                           guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        "The best language for type-safe systems programming is "
+    }]
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=10,
+        logprobs=True,
+        top_logprobs=5,
+        extra_body=dict(guided_choice=TEST_CHOICE,
+                        guided_decoding_backend=guided_decoding_backend))
+    top_logprobs = chat_completion.choices[0].logprobs.top_logprobs
+
+    # -9999.0 is the minimum logprob returned by OpenAI
+    assert all(
+        isinstance(logprob, float) and logprob >= -9999.0
+        for token_dict in top_logprobs
+        for token, logprob in token_dict.items())
+
+
+async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
+    for _ in range(2):
+        resp = await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[{
+                "role":
+                "user",
+                "content": ('what is 1+1? please respond with a JSON object, '
+                            'the format is {"result": 2}')
+            }],
+            response_format={"type": "json_object"})
+
+        content = resp.choices[0].message.content
+        loaded = json.loads(content)
+        assert loaded == {"result": 2}, loaded
+
+
+async def test_extra_fields(server, client: openai.AsyncOpenAI):
+    with pytest.raises(BadRequestError) as exc_info:
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[{
+                "role": "system",
+                "content": "You are a helpful assistant.",
+                "extra_field": "0",
+            }],  # type: ignore
+            temperature=0,
+            seed=0)
+
+    assert "extra_forbidden" in exc_info.value.message
+
+
+async def test_complex_message_content(server, client: openai.AsyncOpenAI):
+    resp = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=[{
+            "role":
+            "user",
+            "content": [{
+                "type":
+                "text",
+                "text":
+                "what is 1+1? please provide the result without any other text."
+            }]
+        }],
+        temperature=0,
+        seed=0)
+    content = resp.choices[0].message.content
+    assert content == "2"
+
+
+async def test_guided_grammar(server, client: openai.AsyncOpenAI):
+    simple_sql_grammar = """
+start: select_statement
+
+select_statement: "SELECT" column "from" table "where" condition
+
+column: "col_1" | "col_2"
+table: "table_1" | "table_2"
+condition: column "=" number
+
+number: "1" | "2"
+"""
+
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=("Generate a sql state that select col_1 from "
+                "table_1 where it is equals to 1"),
+        temperature=1.0,
+        max_tokens=500,
+        extra_body=dict(guided_grammar=simple_sql_grammar))
+
+    content = completion.choices[0].text
+
+    # use Lark to parse the output, and make sure it's a valid parse tree
+    from lark import Lark
+    parser = Lark(simple_sql_grammar)
+    parser.parse(content)
+
+    # remove spaces for comparison b/c we removed them in the grammar
+    ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")
+
+    assert content.strip() == ground_truth
+
+
+@pytest.mark.parametrize(
+    # first test base model, then test loras
+    "model_name",
+    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+)
+async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
+                                       model_name: str):
+    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+    # test using text and token IDs
+    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
+        completion = await client.completions.create(model=model_name,
+                                                     prompt=prompt,
+                                                     max_tokens=5,
+                                                     temperature=0.0,
+                                                     echo=True,
+                                                     logprobs=1)
+
+        prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
+                                                             list) else prompt
+        assert (completion.choices[0].text is not None
+                and re.search(r"^" + prompt_text, completion.choices[0].text))
+        logprobs = completion.choices[0].logprobs
+        assert logprobs is not None
+        assert len(logprobs.text_offset) > 5
+        assert (len(logprobs.token_logprobs) > 5
+                and logprobs.token_logprobs[0] is None)
+        assert (len(logprobs.top_logprobs) > 5
+                and logprobs.top_logprobs[0] is None)
+        assert len(logprobs.tokens) > 5
+
+
+async def test_long_seed(server, client: openai.AsyncOpenAI):
+    for seed in [
+            torch.iinfo(torch.long).min - 1,
+            torch.iinfo(torch.long).max + 1
+    ]:
+        with pytest.raises(BadRequestError) as exc_info:
+            await client.chat.completions.create(
+                model=MODEL_NAME,
+                messages=[{
+                    "role": "system",
+                    "content": "You are a helpful assistant.",
+                }],
+                temperature=0,
+                seed=seed)
+
+        assert ("greater_than_equal" in exc_info.value.message
+                or "less_than_equal" in exc_info.value.message)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
--- a/tests/entrypoints/test_server_oot_registration.py
+++ b/tests/entrypoints/test_server_oot_registration.py
@@ -0,0 +1,66 @@
+import multiprocessing
+import sys
+import time
+
+import torch
+from openai import OpenAI, OpenAIError
+
+from vllm import ModelRegistry
+from vllm.model_executor.models.opt import OPTForCausalLM
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.utils import get_open_port
+
+
+class MyOPTForCausalLM(OPTForCausalLM):
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        # this dummy model always predicts the first token
+        logits = super().compute_logits(hidden_states, sampling_metadata)
+        logits.zero_()
+        logits[:, 0] += 1.0
+        return logits
+
+
+def server_function(port):
+    # register our dummy model
+    ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM)
+    sys.argv = ["placeholder.py"] + \
+        ("--model facebook/opt-125m --dtype"
+        f" float32 --api-key token-abc123 --port {port}").split()
+    import runpy
+    runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__')
+
+
+def test_oot_registration_for_api_server():
+    port = get_open_port()
+    server = multiprocessing.Process(target=server_function, args=(port, ))
+    server.start()
+    client = OpenAI(
+        base_url=f"http://localhost:{port}/v1",
+        api_key="token-abc123",
+    )
+    while True:
+        try:
+            completion = client.chat.completions.create(
+                model="facebook/opt-125m",
+                messages=[{
+                    "role": "system",
+                    "content": "You are a helpful assistant."
+                }, {
+                    "role": "user",
+                    "content": "Hello!"
+                }],
+                temperature=0,
+            )
+            break
+        except OpenAIError as e:
+            if "Connection error" in str(e):
+                time.sleep(3)
+            else:
+                raise e
+    server.kill()
+    generated_text = completion.choices[0].message.content
+    # make sure only the first token is generated
+    rest = generated_text.replace("<s>", "")
+    assert rest == ""
--- a/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json
+++ b/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json
@@ -0,0 +1,90 @@
+{
+    "model_type": "llama",
+    "kv_cache": {
+        "dtype": "float8_e4m3fn",
+        "scaling_factor": {
+            "0": {
+                "0": 0.0230364128947258,
+                "1": 0.01979283057153225,
+                "2": 0.0241350457072258,
+                "3": 0.0308314748108387,
+                "4": 0.0430733822286129,
+                "5": 0.0370396226644516,
+                "6": 0.0306222103536129,
+                "7": 0.0357491634786129,
+                "8": 0.0358189195394516,
+                "9": 0.0443289652466774,
+                "10": 0.0433175228536129,
+                "11": 0.0416782945394516,
+                "12": 0.0366908498108387,
+                "13": 0.0432477705180645,
+                "14": 0.0410505048930645,
+                "15": 0.0457589291036129,
+                "16": 0.0418526791036129,
+                "17": 0.0432477705180645,
+                "18": 0.0469447560608387,
+                "19": 0.0514787957072258,
+                "20": 0.0541294664144516,
+                "21": 0.0587681382894516,
+                "22": 0.0625,
+                "23": 0.0585588738322258,
+                "24": 0.0600237175822258,
+                "25": 0.0588030144572258,
+                "26": 0.0531180277466774,
+                "27": 0.06396484375,
+                "28": 0.0603027381002903,
+                "29": 0.0582101047039032,
+                "30": 0.0625348836183548,
+                "31": 0.0585588738322258,
+                "32": 0.0582798570394516,
+                "33": 0.0575125589966774,
+                "34": 0.0590820349752903,
+                "35": 0.0614188089966774,
+                "36": 0.0631975457072258,
+                "37": 0.0615931935608387,
+                "38": 0.0601283498108387,
+                "39": 0.0571986623108387,
+                "40": 0.0670340433716774,
+                "41": 0.0523507259786129,
+                "42": 0.0547223798930645,
+                "43": 0.0631975457072258,
+                "44": 0.0663713738322258,
+                "45": 0.0603376142680645,
+                "46": 0.0652204304933548,
+                "47": 0.0734514519572258,
+                "48": 0.0693708211183548,
+                "49": 0.0725446492433548,
+                "50": 0.0627790242433548,
+                "51": 0.0691266804933548,
+                "52": 0.0688825398683548,
+                "53": 0.068429134786129,
+                "54": 0.0605119988322258,
+                "55": 0.0799386203289032,
+                "56": 0.0853097140789032,
+                "57": 0.0661969929933548,
+                "58": 0.0689871683716774,
+                "59": 0.0724051371216774,
+                "60": 0.0541643425822258,
+                "61": 0.0626743882894516,
+                "62": 0.0628487765789032,
+                "63": 0.0607212632894516,
+                "64": 0.0589076466858387,
+                "65": 0.0451660193502903,
+                "66": 0.0453055277466774,
+                "67": 0.0414341539144516,
+                "68": 0.0385044664144516,
+                "69": 0.0414341539144516,
+                "70": 0.0466308631002903,
+                "71": 0.0399693101644516,
+                "72": 0.0437011756002903,
+                "73": 0.0434221550822258,
+                "74": 0.0428989976644516,
+                "75": 0.0401785746216774,
+                "76": 0.0431082621216774,
+                "77": 0.0484444759786129,
+                "78": 0.0417829267680645,
+                "79": 0.0418178029358387
+            }
+        }
+    }
+}
--- a/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json
+++ b/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json
@@ -0,0 +1,42 @@
+{
+    "model_type": "llama",
+    "kv_cache": {
+        "dtype": "float8_e4m3fn",
+        "scaling_factor": {
+            "0": {
+                "0": 0.0152239128947258,
+                "1": 0.0188860222697258,
+                "2": 0.0354178324341774,
+                "3": 0.0376674123108387,
+                "4": 0.0418526791036129,
+                "5": 0.0433175228536129,
+                "6": 0.0397600457072258,
+                "7": 0.0424455925822258,
+                "8": 0.0415387861430645,
+                "9": 0.0408412404358387,
+                "10": 0.0395856611430645,
+                "11": 0.0377371683716774,
+                "12": 0.0400739423930645,
+                "13": 0.040771484375,
+                "14": 0.0393415205180645,
+                "15": 0.0369001142680645,
+                "16": 0.03857421875,
+                "17": 0.0387486070394516,
+                "18": 0.0403180830180645,
+                "19": 0.0396205373108387,
+                "20": 0.0375627800822258,
+                "21": 0.0407366082072258,
+                "22": 0.0432477705180645,
+                "23": 0.0377022884786129,
+                "24": 0.0399693101644516,
+                "25": 0.0374581478536129,
+                "26": 0.0413295216858387,
+                "27": 0.0442243330180645,
+                "28": 0.0424804724752903,
+                "29": 0.0456891767680645,
+                "30": 0.0409109964966774,
+                "31": 0.0482352152466774
+            }
+        }
+    }
+}
--- a/tests/kernels/allclose_default.py
+++ b/tests/kernels/allclose_default.py
@@ -0,0 +1,18 @@
+import torch
+
+# Reference default values of atol and rtol are from
+# https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67
+default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5}
+default_rtol = {
+    torch.float16: 1e-3,
+    torch.bfloat16: 1.6e-2,
+    torch.float: 1.3e-6
+}
+
+
+def get_default_atol(output) -> float:
+    return default_atol[output.dtype]
+
+
+def get_default_rtol(output) -> float:
+    return default_rtol[output.dtype]
--- a/tests/kernels/conftest.py
+++ b/tests/kernels/conftest.py
@@ -0,0 +1,14 @@
+import pytest
+
+from vllm.utils import (create_kv_caches_with_random,
+                        create_kv_caches_with_random_flash)
+
+
+@pytest.fixture()
+def kv_cache_factory():
+    return create_kv_caches_with_random
+
+
+@pytest.fixture()
+def kv_cache_factory_flashinfer():
+    return create_kv_caches_with_random_flash
--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@@ -0,0 +1,78 @@
+from typing import Type
+
+import pytest
+import torch
+from allclose_default import get_default_atol, get_default_rtol
+
+from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
+                                                   NewGELU, SiluAndMul)
+
+DTYPES = [torch.half, torch.bfloat16, torch.float]
+NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
+D = [512, 4096, 5120, 13824]  # Arbitrary values for testing
+SEEDS = [0]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+
+
+@pytest.mark.parametrize("activation", ["silu", "gelu", "gelu_tanh"])
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("d", D)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_act_and_mul(
+    activation: str,
+    num_tokens: int,
+    d: int,
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+) -> None:
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    x = torch.randn(num_tokens, 2 * d, dtype=dtype)
+    if activation == "silu":
+        layer = SiluAndMul()
+    elif activation == "gelu":
+        layer = GeluAndMul(approximate="none")
+    elif activation == "gelu_tanh":
+        layer = GeluAndMul(approximate="tanh")
+    out = layer(x)
+    ref_out = layer._forward(x)
+    # The SiLU and GELU implementations are equivalent to the native PyTorch
+    # implementations, so we can do exact comparison.
+    assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0)
+
+
+@pytest.mark.parametrize("activation", [FastGELU, NewGELU])
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("d", D)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_activation(
+    activation: Type[torch.nn.Module],
+    num_tokens: int,
+    d: int,
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+) -> None:
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    x = torch.randn(num_tokens, d, dtype=dtype)
+    layer = activation()
+    out = layer(x)
+    ref_out = layer._forward(x)
+    assert torch.allclose(out,
+                          ref_out,
+                          atol=get_default_atol(out),
+                          rtol=get_default_rtol(out))
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -0,0 +1,376 @@
+import random
+from typing import List, Optional, Tuple
+
+import pytest
+import torch
+from allclose_default import get_default_atol, get_default_rtol
+from xformers import ops as xops
+from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
+
+from vllm import _custom_ops as ops
+from vllm.utils import get_max_shared_memory_bytes, is_hip
+
+FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
+# This will change depending on the compute capability.
+# - 512 as a buffer
+MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
+# There may not be enough gpu memory due to large NUM_BLOCKS.
+# Reduce NUM_BLOCKS when it happens.
+NUM_BLOCKS = 4321  # Arbitrary values for testing
+PARTITION_SIZE = 512
+# flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16}
+DTYPES = [torch.half, torch.bfloat16, torch.float
+          ] if not is_hip() else [torch.half, torch.bfloat16]
+NUM_GEN_SEQS = [7]  # Arbitrary values for testing
+NUM_PREFILL_SEQS = [3]  # Arbitrary values for testing
+NUM_HEADS = [(40, 40), (64, 8)]  # Arbitrary values for testing
+
+# FlashAttention forward only supports head dimension at most 128
+# https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62
+HEAD_SIZES = [64, 80, 96, 112, 128, 256
+              ] if not is_hip() else [64, 80, 96, 112, 128]
+
+BLOCK_SIZES = [16, 32]
+USE_ALIBI = [False, True]
+KV_CACHE_DTYPE = ["auto", "fp8"]
+SEEDS = [0]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+
+
+def ref_masked_attention(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    scale: float,
+    attn_mask: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
+    if attn_mask is not None:
+        attn_weights = attn_weights + attn_mask.float()
+    attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype)
+    out = torch.einsum("hqk,khd->qhd", attn_weights, value)
+    return out
+
+
+def ref_single_query_cached_kv_attention(
+    output: torch.Tensor,
+    query: torch.Tensor,
+    num_queries_per_kv: int,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    block_tables: torch.Tensor,
+    seq_lens: torch.Tensor,
+    scale: float,
+    alibi_slopes: Optional[torch.Tensor],
+) -> None:
+    num_query_heads = query.shape[1]
+    num_kv_heads = value_cache.shape[1]
+    head_size = value_cache.shape[2]
+    block_size = value_cache.shape[3]
+    num_seqs = query.shape[0]
+
+    block_tables = block_tables.cpu().tolist()
+    seq_lens = seq_lens.cpu().tolist()
+    for i in range(num_seqs):
+        q = query[i].unsqueeze(0)
+        block_table = block_tables[i]
+        seq_len = int(seq_lens[i])
+
+        keys = []
+        values = []
+        for j in range(seq_len):
+            block_number = int(block_table[j // block_size])
+            block_offset = j % block_size
+
+            k = key_cache[block_number, :, :, block_offset, :]
+            k = k.reshape(num_kv_heads, head_size)
+            keys.append(k)
+
+            v = value_cache[block_number, :, :, block_offset]
+            values.append(v)
+        keys = torch.stack(keys, dim=0)
+        values = torch.stack(values, dim=0)
+        if num_queries_per_kv > 1:
+            # Handle MQA and GQA
+            keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)
+            values = torch.repeat_interleave(values, num_queries_per_kv, dim=1)
+
+        alibi_bias = None
+        if alibi_slopes is not None:
+            # Create the ALiBi bias used in the paged attention kernel.
+            position_ids = torch.arange(seq_len).int()
+            alibi_bias = (position_ids - seq_len + 1).float()
+            alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(
+                1, 1, -1)
+
+        out = ref_masked_attention(q, keys, values, scale, alibi_bias)
+        out = out.view(num_query_heads, head_size)
+        output[i].copy_(out, non_blocking=True)
+
+
+@pytest.mark.parametrize("version", ["v1", "v2"])
+@pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("use_alibi", USE_ALIBI)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_paged_attention(
+    kv_cache_factory,
+    version: str,
+    num_seqs: int,
+    num_heads: Tuple[int, int],
+    head_size: int,
+    use_alibi: bool,
+    block_size: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: str,
+    seed: int,
+    device: str,
+) -> None:
+    random.seed(seed)
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    scale = float(1.0 / (head_size**0.5))
+    num_query_heads, num_kv_heads = num_heads
+    query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype)
+    query.uniform_(-scale, scale)
+
+    assert num_query_heads % num_kv_heads == 0
+    num_queries_per_kv = num_query_heads // num_kv_heads
+    alibi_slopes = None
+    if use_alibi:
+        alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
+
+    seq_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
+    seq_lens[-1] = MAX_SEQ_LEN
+    max_seq_len = max(seq_lens)
+    seq_lens = torch.tensor(seq_lens, dtype=torch.int)
+
+    # Create the block tables.
+    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
+    block_tables = []
+    for _ in range(num_seqs):
+        block_table = [
+            random.randint(0, NUM_BLOCKS - 1)
+            for _ in range(max_num_blocks_per_seq)
+        ]
+        block_tables.append(block_table)
+    block_tables = torch.tensor(block_tables, dtype=torch.int)
+
+    # Create the KV caches.
+    key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,
+                                                num_kv_heads, head_size,
+                                                kv_cache_dtype, dtype, seed,
+                                                device)
+    key_cache, value_cache = key_caches[0], value_caches[0]
+
+    # Using default kv_scale
+    kv_scale = 1.0
+
+    # Call the paged attention kernel.
+    output = torch.empty_like(query)
+    if version == "v1":
+        ops.paged_attention_v1(
+            output,
+            query,
+            key_cache,
+            value_cache,
+            num_kv_heads,
+            scale,
+            block_tables,
+            seq_lens,
+            block_size,
+            max_seq_len,
+            alibi_slopes,
+            kv_cache_dtype,
+            kv_scale,
+        )
+    elif version == "v2":
+        num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
+        assert PARTITION_SIZE % block_size == 0
+        num_seqs, num_heads, head_size = output.shape
+        tmp_output = torch.empty(
+            size=(num_seqs, num_heads, num_partitions, head_size),
+            dtype=output.dtype,
+        )
+        exp_sums = torch.empty(
+            size=(num_seqs, num_heads, num_partitions),
+            dtype=torch.float32,
+        )
+        max_logits = torch.empty_like(exp_sums)
+        ops.paged_attention_v2(
+            output,
+            exp_sums,
+            max_logits,
+            tmp_output,
+            query,
+            key_cache,
+            value_cache,
+            num_kv_heads,
+            scale,
+            block_tables,
+            seq_lens,
+            block_size,
+            max_seq_len,
+            alibi_slopes,
+            kv_cache_dtype,
+            kv_scale,
+        )
+    else:
+        raise AssertionError(f"Unknown version: {version}")
+
+    # Run the reference implementation.
+    if kv_cache_dtype == "fp8":
+        # Convert cache data back to dtype.
+        x = 16 // torch.tensor([], dtype=dtype).element_size()
+        key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x,
+                           block_size, x)
+        dequantized_key_cache = torch.empty(size=key_cache_shape,
+                                            dtype=dtype,
+                                            device=device)
+        ops.convert_fp8(key_cache, dequantized_key_cache)
+        key_cache = dequantized_key_cache
+
+        value_cache_shape = value_cache.shape
+        dequantized_value_cache = torch.empty(size=value_cache_shape,
+                                              dtype=dtype,
+                                              device=device)
+        ops.convert_fp8(value_cache, dequantized_value_cache)
+        value_cache = dequantized_value_cache
+
+    ref_output = torch.empty_like(query)
+    ref_single_query_cached_kv_attention(
+        ref_output,
+        query,
+        num_queries_per_kv,
+        key_cache,
+        value_cache,
+        block_tables,
+        seq_lens,
+        scale,
+        alibi_slopes,
+    )
+
+    # NOTE(woosuk): Due to the kernel-level differences in the two
+    # implementations, there is a small numerical difference in the two
+    # outputs. Thus, we use a relaxed tolerance for the test.
+    atol = get_default_atol(output) if is_hip() else 1e-3
+    rtol = get_default_rtol(output) if is_hip() else 1e-5
+
+    # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error,
+    # so we use a relaxed tolerance for the test.
+    atol, rtol = 1e-3, 1e-5
+    if kv_cache_dtype == "fp8":
+        atol, rtol = 1e-2, 1e-5
+    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)
+
+
+def ref_multi_query_kv_attention(
+    cu_seq_lens: List[int],
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    scale: float,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    num_seqs = len(cu_seq_lens) - 1
+    ref_outputs = []
+    for i in range(num_seqs):
+        start_idx = cu_seq_lens[i]
+        end_idx = cu_seq_lens[i + 1]
+        seq_len = end_idx - start_idx
+
+        # Create attention mask.
+        attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype),
+                               diagonal=1)
+        attn_mask = attn_mask * torch.finfo(dtype).min
+        attn_mask = attn_mask.to(dtype=dtype)
+
+        ref_output = ref_masked_attention(
+            query[start_idx:end_idx],
+            key[start_idx:end_idx],
+            value[start_idx:end_idx],
+            scale,
+            attn_mask=attn_mask,
+        )
+        ref_outputs.append(ref_output)
+    ref_output = torch.cat(ref_outputs, dim=0)
+    return ref_output
+
+
+# TODO(woosuk): Add tests for USE_ALIBI=True.
+@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_multi_query_kv_attention(
+    num_seqs: int,
+    num_heads: Tuple[int, int],
+    head_size: int,
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+) -> None:
+    random.seed(seed)
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
+    # As the xformers library is already tested with its own tests, we can use
+    # a smaller MAX_SEQ_LEN here.
+    max_len = min(MAX_SEQ_LEN, 4096)
+    seq_lens = random.sample(range(1, max_len), num_seqs)
+    num_tokens = sum(seq_lens)
+
+    scale = float(1.0 / (head_size**0.5))
+    num_query_heads, num_kv_heads = num_heads
+    qkv = torch.empty(num_tokens,
+                      num_query_heads + 2 * num_kv_heads,
+                      head_size,
+                      dtype=dtype)
+    qkv.uniform_(-scale, scale)
+    query, key, value = qkv.split(
+        [num_query_heads, num_kv_heads, num_kv_heads], dim=1)
+
+    num_queries_per_kv = num_query_heads // num_kv_heads
+    if num_queries_per_kv > 1:
+        # Handle MQA and GQA
+        key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
+        value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
+    attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)
+    output = xops.memory_efficient_attention_forward(
+        query.unsqueeze(0),
+        key.unsqueeze(0),
+        value.unsqueeze(0),
+        attn_bias=attn_bias,
+        p=0.0,
+        scale=scale,
+    )
+    output = output.squeeze(0)
+
+    cu_seq_lens = [0]
+    for seq_len in seq_lens:
+        cu_seq_lens.append(cu_seq_lens[-1] + seq_len)
+    ref_output = ref_multi_query_kv_attention(
+        cu_seq_lens,
+        query,
+        key,
+        value,
+        scale,
+        dtype,
+    )
+    atol = get_default_atol(output) if is_hip() else 1e-3
+    rtol = get_default_rtol(output) if is_hip() else 1e-5
+    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -0,0 +1,375 @@
+import random
+from typing import Tuple
+
+import pytest
+import torch
+
+from vllm import _custom_ops as ops
+from vllm_C import cache_ops
+from vllm.utils import is_hip
+
+COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
+DTYPES = [torch.half, torch.bfloat16, torch.float]
+NUM_TOKENS = [42]  # Arbitrary values for testing
+NUM_LAYERS = [1]  # Arbitrary values for testing
+NUM_HEADS = [8]  # Arbitrary values for testing
+HEAD_SIZES = [64, 80, 96, 112, 128, 256]
+BLOCK_SIZES = [8, 16, 32]
+
+# Arbitrary values for testing
+# don't make it too large. e.g. [1024, 36000] will OOM
+NUM_BLOCKS = [1024, 10000]
+
+NUM_MAPPINGS = [256]  # Arbitrary values for testing
+SEEDS = [0]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+KV_CACHE_DTYPE = ["auto", "fp8"]
+
+
+@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
+@pytest.mark.parametrize("num_layers", NUM_LAYERS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
+@torch.inference_mode()
+def test_copy_blocks(
+    kv_cache_factory,
+    num_mappings: int,
+    num_layers: int,
+    num_heads: int,
+    head_size: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    seed: int,
+    kv_cache_dtype: str,
+    device: str,
+) -> None:
+    random.seed(seed)
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    # Generate random block mappings where each source block is mapped to two
+    # destination blocks.
+    assert 2 * num_mappings <= num_blocks
+    src_blocks = random.sample(range(num_blocks), num_mappings)
+    remainig_blocks = list(set(range(num_blocks)) - set(src_blocks))
+    dst_blocks = random.sample(remainig_blocks, 2 * num_mappings)
+    block_mapping = {}
+    for i in range(num_mappings):
+        src = src_blocks[i]
+        dst1 = dst_blocks[2 * i]
+        dst2 = dst_blocks[2 * i + 1]
+        block_mapping[src] = [dst1, dst2]
+
+    # Create the KV caches.
+    key_caches, value_caches = kv_cache_factory(num_blocks, block_size,
+                                                num_layers, num_heads,
+                                                head_size, kv_cache_dtype,
+                                                dtype, seed, device)
+
+    # Clone the KV caches.
+    cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
+    cloned_value_caches = [value_cache.clone() for value_cache in value_caches]
+
+    # Call the copy blocks kernel.
+    ops.copy_blocks(key_caches, value_caches, block_mapping)
+
+    # Run the reference implementation.
+    for src, dsts in block_mapping.items():
+        for dst in dsts:
+            for cloned_key_cache in cloned_key_caches:
+                cloned_key_cache[dst].copy_(cloned_key_cache[src])
+            for cloned_value_cache in cloned_value_caches:
+                cloned_value_cache[dst].copy_(cloned_value_cache[src])
+
+    # Compare the results.
+    for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
+        assert torch.allclose(key_cache, cloned_key_cache)
+    for value_cache, cloned_value_cache in zip(value_caches,
+                                               cloned_value_caches):
+        assert torch.allclose(value_cache, cloned_value_cache)
+
+
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
+@torch.inference_mode()
+def test_reshape_and_cache(
+    kv_cache_factory,
+    num_tokens: int,
+    num_heads: int,
+    head_size: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    kv_cache_dtype: str,
+) -> None:
+    if not is_hip() and kv_cache_dtype == "fp8":
+        pytest.skip()  # This test is not tuned for e5m2 cuda precision
+    random.seed(seed)
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    # Create a random slot mapping.
+    num_slots = block_size * num_blocks
+    slot_mapping = random.sample(range(num_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping, dtype=torch.long)
+
+    qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype)
+    _, key, value = qkv.unbind(dim=1)
+
+    # Create the KV caches.
+    key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1,
+                                                num_heads, head_size,
+                                                kv_cache_dtype, dtype, seed,
+                                                device)
+    key_cache, value_cache = key_caches[0], value_caches[0]
+
+    # Clone the KV caches.
+    if kv_cache_dtype == "fp8":
+        cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
+        ops.convert_fp8(key_cache, cloned_key_cache)
+        cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
+        ops.convert_fp8(value_cache, cloned_value_cache)
+    else:
+        cloned_key_cache = key_cache.clone()
+        cloned_value_cache = value_cache.clone()
+
+    # Using default kv_scale
+    kv_scale = 1.0
+
+    # Call the reshape_and_cache kernel.
+    ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
+                          kv_cache_dtype, kv_scale)
+
+    if kv_cache_dtype == "fp8":
+        result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
+        ops.convert_fp8(key_cache, result_key_cache)
+        result_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
+        ops.convert_fp8(value_cache, result_value_cache)
+
+    # Run the reference implementation.
+    reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
+    block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
+    block_indicies = block_indicies.cpu().tolist()
+    block_offsets = slot_mapping % block_size
+    block_offsets = block_offsets.cpu().tolist()
+    for i in range(num_tokens):
+        block_idx = block_indicies[i]
+        block_offset = block_offsets[i]
+        cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
+        cloned_value_cache[block_idx, :, :, block_offset] = value[i]
+
+    if kv_cache_dtype == "fp8":
+        assert torch.allclose(result_key_cache,
+                              cloned_key_cache,
+                              atol=0.001,
+                              rtol=0.1)
+        assert torch.allclose(result_value_cache,
+                              cloned_value_cache,
+                              atol=0.001,
+                              rtol=0.1)
+    else:
+        assert torch.allclose(key_cache, cloned_key_cache)
+        assert torch.allclose(value_cache, cloned_value_cache)
+
+
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
+@torch.inference_mode()
+def test_reshape_and_cache_flash(
+    kv_cache_factory_flashinfer,
+    num_tokens: int,
+    num_heads: int,
+    head_size: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    kv_cache_dtype: str,
+) -> None:
+    if kv_cache_dtype == "fp8":
+        pytest.skip()
+    random.seed(seed)
+    torch.random.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+
+    # Create a random slot mapping.
+    num_slots = block_size * num_blocks
+    slot_mapping = random.sample(range(num_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device='cuda')
+
+    qkv = torch.randn(num_tokens,
+                      3,
+                      num_heads,
+                      head_size,
+                      dtype=dtype,
+                      device=device)
+    _, key, value = qkv.unbind(dim=1)
+
+    # Create the KV caches.
+    key_caches, value_caches = kv_cache_factory_flashinfer(
+        num_blocks,
+        block_size,
+        1,
+        num_heads,
+        head_size,
+        kv_cache_dtype,
+        dtype,
+    )
+    key_cache, value_cache = key_caches[0], value_caches[0]
+
+    # Clone the KV caches.
+    cloned_key_cache = key_cache.clone()
+    cloned_value_cache = value_cache.clone()
+
+    # Call the reshape_and_cache kernel.
+    cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
+                                      slot_mapping, kv_cache_dtype)
+
+    # Run the reference implementation.
+    block_indicies = torch.div(slot_mapping, block_size, rounding_mode='floor')
+    block_indicies = block_indicies.cpu().tolist()
+    block_offsets = slot_mapping % block_size
+    block_offsets = block_offsets.cpu().tolist()
+    for i in range(num_tokens):
+        block_idx = block_indicies[i]
+        block_offset = block_offsets[i]
+        cloned_key_cache[block_idx, block_offset, :, :] = key[i]
+        cloned_value_cache[block_idx, block_offset, :, :] = value[i]
+
+    assert torch.allclose(key_cache, cloned_key_cache)
+    assert torch.allclose(value_cache, cloned_value_cache)
+
+
+@pytest.mark.parametrize("direction", COPYING_DIRECTION)
+@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
+@torch.inference_mode()
+def test_swap_blocks(
+    kv_cache_factory,
+    direction: Tuple[str, str],
+    num_mappings: int,
+    num_heads: int,
+    head_size: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    kv_cache_dtype: str,
+) -> None:
+    if kv_cache_dtype == "fp8" and "cpu" in direction:
+        pytest.skip()
+    if not is_hip() and kv_cache_dtype == "fp8":
+        pytest.skip()  # This test is not tuned for e5m2 cuda precision
+    random.seed(seed)
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+
+    src_device = device if direction[0] == "cuda" else 'cpu'
+    dst_device = device if direction[1] == "cuda" else 'cpu'
+
+    src_blocks = random.sample(range(num_blocks), num_mappings)
+    # For the same device, mapping must not overlap
+    if src_device == dst_device:
+        remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
+        dst_blocks = random.sample(remaining_blocks, num_mappings)
+    else:
+        dst_blocks = random.sample(range(num_blocks), num_mappings)
+
+    block_mapping = dict(zip(src_blocks, dst_blocks))
+
+    # Create the KV caches on the first device.
+    src_key_caches, src_value_caches = kv_cache_factory(
+        num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype,
+        seed, src_device)
+
+    # Create the KV caches on the second device.
+    dist_key_caches, dist_value_caches = kv_cache_factory(
+        num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype,
+        seed, dst_device)
+
+    src_key_caches_clone = src_key_caches[0].clone()
+    src_value_caches_clone = src_value_caches[0].clone()
+
+    # Call the swap_blocks kernel.
+    ops.swap_blocks(src_key_caches[0], dist_key_caches[0], block_mapping)
+    ops.swap_blocks(src_value_caches[0], dist_value_caches[0], block_mapping)
+
+    for src, dst in block_mapping.items():
+        assert torch.allclose(src_key_caches_clone[src].cpu(),
+                              dist_key_caches[0][dst].cpu())
+        assert torch.allclose(src_value_caches_clone[src].cpu(),
+                              dist_value_caches[0][dst].cpu())
+
+
+@pytest.mark.skipif(not is_hip(), reason="FP8 conversion test requires e4m3")
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_fp8_conversion(
+    num_heads: int,
+    head_size: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+) -> None:
+    random.seed(seed)
+    torch.random.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+
+    low = -224.0
+    high = 224.0
+    shape = (num_blocks, num_heads, head_size, block_size)
+    cache = torch.empty(shape, dtype=dtype, device=device)
+    cache.uniform_(low, high)
+
+    cache_fp8 = torch.empty_like(cache, dtype=torch.uint8)
+    ops.convert_fp8(cache, cache_fp8)
+
+    converted_cache = torch.empty_like(cache)
+    ops.convert_fp8(cache_fp8, converted_cache)
+
+    assert torch.allclose(cache, converted_cache, atol=0.001, rtol=0.1)
--- a/tests/kernels/test_layernorm.py
+++ b/tests/kernels/test_layernorm.py
@@ -0,0 +1,54 @@
+import pytest
+import torch
+
+from vllm.model_executor.layers.layernorm import RMSNorm
+
+DTYPES = [torch.half, torch.bfloat16, torch.float]
+NUM_TOKENS = [7, 83, 4096]  # Arbitrary values for testing
+HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192,
+                8199]  # Arbitrary values for testing
+ADD_RESIDUAL = [False, True]
+SEEDS = [0]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+
+
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_rms_norm(
+    num_tokens: int,
+    hidden_size: int,
+    add_residual: bool,
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+) -> None:
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    layer = RMSNorm(hidden_size).to(dtype=dtype)
+    layer.weight.data.normal_(mean=1.0, std=0.1)
+    scale = 1 / (2 * hidden_size)
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    x *= scale
+    residual = torch.randn_like(x) * scale if add_residual else None
+
+    # NOTE(woosuk): The reference implementation should be executed first
+    # because the custom kernel is in-place.
+    ref_out = layer._forward(x, residual)
+    out = layer(x, residual)
+    # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
+    # numerical errors than other operators because they involve reductions.
+    # Therefore, we use a larger tolerance.
+    if add_residual:
+        assert torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2)
+        assert torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
+    else:
+        assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2)
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -0,0 +1,101 @@
+"""Tests for the MOE layers.
+
+Run `pytest tests/kernels/test_moe.py`.
+"""
+import pytest
+import torch
+from transformers import MixtralConfig
+from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
+
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe import fused_moe
+from vllm.model_executor.models.mixtral import MixtralMoE
+
+
+def torch_moe(a, w1, w2, score, topk):
+    B, D = a.shape
+    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+    topk_weight = topk_weight.view(-1)
+    topk_ids = topk_ids.view(-1)
+    for i in range(w1.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            out[mask] = SiluAndMul()(
+                a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1)
+    return (out.view(B, -1, w2.shape[1]) *
+            topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
+
+
+@pytest.mark.parametrize("m", [512, 222, 33, 1])
+@pytest.mark.parametrize("n", [2048, 256, 1024])
+@pytest.mark.parametrize("k", [128, 511, 1024])
+@pytest.mark.parametrize("e", [8, 64])
+@pytest.mark.parametrize("topk", [2, 6])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+def test_fused_moe(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    dtype: torch.dtype,
+):
+    a = torch.randn((m, k), device='cuda', dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10
+    w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10
+
+    score = torch.randn((m, e), device='cuda', dtype=dtype)
+    triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
+    torch_output = torch_moe(a, w1, w2, score, topk)
+    assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0)
+
+
+@pytest.mark.parametrize("dtype",
+                         [torch.float32, torch.float16, torch.bfloat16])
+@torch.inference_mode()
+def test_mixtral_moe(dtype: torch.dtype):
+    """Make sure our Mixtral MoE implementation agrees with the one from
+    huggingface."""
+
+    # Instantiate our and huggingface's MoE blocks
+    config = MixtralConfig()
+    hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda")
+    vllm_moe = MixtralMoE(
+        num_experts=config.num_local_experts,
+        top_k=config.num_experts_per_tok,
+        hidden_size=config.hidden_size,
+        intermediate_size=config.intermediate_size,
+        params_dtype=dtype,
+        tp_size=1,
+    ).cuda()
+
+    # Load the weights
+    vllm_moe.gate.weight.data[:] = hf_moe.gate.weight.data
+    for i in range(config.num_local_experts):
+        weights = (hf_moe.experts[i].w1.weight.data,
+                   hf_moe.experts[i].w3.weight.data)
+        vllm_moe.w13_weight[i][:] = torch.cat(weights, dim=0)
+        vllm_moe.w2_weight[i][:] = hf_moe.experts[i].w2.weight.data
+
+    # Generate input batch of dimensions [batch_size, seq_len, hidden_dim]
+    hf_inputs = torch.randn((1, 64, config.hidden_size)).to(dtype).to("cuda")
+    # vLLM uses 1D query [num_tokens, hidden_dim]
+    vllm_inputs = hf_inputs.flatten(0, 1)
+
+    # Run forward passes for both MoE blocks
+    hf_states, _ = hf_moe.forward(hf_inputs)
+    vllm_states = vllm_moe.forward(vllm_inputs)
+
+    mixtral_moe_tol = {
+        torch.float32: 1e-3,
+        torch.float16: 1e-3,
+        torch.bfloat16: 1e-2,
+    }
+
+    assert torch.allclose(hf_states.flatten(0, 1),
+                          vllm_states,
+                          rtol=mixtral_moe_tol[dtype],
+                          atol=mixtral_moe_tol[dtype])
--- a/tests/kernels/test_pos_encoding.py
+++ b/tests/kernels/test_pos_encoding.py
@@ -0,0 +1,208 @@
+from itertools import accumulate
+from typing import List, Optional
+
+import pytest
+import torch
+from allclose_default import get_default_atol, get_default_rtol
+
+from vllm.model_executor.layers.rotary_embedding import get_rope
+
+IS_NEOX_STYLE = [True, False]
+DTYPES = [torch.half, torch.bfloat16, torch.float]
+HEAD_SIZES = [64, 80, 96, 112, 128, 256]
+ROTARY_DIMS = [None, 32]  # None means rotary dim == head size
+NUM_HEADS = [7, 17]  # Arbitrary values for testing
+BATCH_SIZES = [1, 5]  # Arbitrary values for testing
+SEQ_LENS = [11, 8192]  # Arbitrary values for testing
+SEEDS = [0]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+
+
+@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("seq_len", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_rotary_embedding(
+    is_neox_style: bool,
+    batch_size: int,
+    seq_len: int,
+    num_heads: int,
+    head_size: int,
+    rotary_dim: Optional[int],
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    max_position: int = 8192,
+    base: int = 10000,
+) -> None:
+    if rotary_dim is None:
+        rotary_dim = head_size
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    if rotary_dim is None:
+        rotary_dim = head_size
+    rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
+    rope = rope.to(dtype=dtype)
+
+    positions = torch.randint(0, max_position, (batch_size, seq_len))
+    query = torch.randn(batch_size,
+                        seq_len,
+                        num_heads * head_size,
+                        dtype=dtype)
+    key = torch.randn_like(query)
+
+    # NOTE(woosuk): The reference implementation should be executed first
+    # because the custom kernel is in-place.
+    ref_query, ref_key = rope._forward(positions, query, key)
+    out_query, out_key = rope.forward(positions, query, key)
+    # Compare the results.
+    assert torch.allclose(out_query,
+                          ref_query,
+                          atol=get_default_atol(out_query),
+                          rtol=get_default_rtol(out_query))
+    assert torch.allclose(out_key,
+                          ref_key,
+                          atol=get_default_atol(out_key),
+                          rtol=get_default_rtol(out_key))
+
+
+@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("seq_len", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_batched_rotary_embedding(
+    is_neox_style: bool,
+    batch_size: int,
+    seq_len: int,
+    num_heads: int,
+    head_size: int,
+    rotary_dim: Optional[int],
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    max_position: int = 8192,
+    base: int = 10000,
+) -> None:
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    if rotary_dim is None:
+        rotary_dim = head_size
+    rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, {
+        "type": "linear",
+        "factor": (1, )
+    })
+    rope = rope.to(dtype=dtype)
+
+    positions = torch.randint(0, max_position, (batch_size, seq_len))
+    query = torch.randn(batch_size,
+                        seq_len,
+                        num_heads * head_size,
+                        dtype=dtype)
+    key = torch.randn_like(query)
+
+    # NOTE(woosuk): The reference implementation should be executed first
+    # because the custom kernel is in-place.
+    ref_query, ref_key = rope._forward(positions, query, key)
+    out_query, out_key = rope.forward(positions,
+                                      query,
+                                      key,
+                                      offsets=torch.zeros(batch_size * seq_len,
+                                                          dtype=int,
+                                                          device=device))
+    # Compare the results.
+    assert torch.allclose(out_query,
+                          ref_query,
+                          atol=get_default_atol(out_query),
+                          rtol=get_default_rtol(out_query))
+    assert torch.allclose(out_key,
+                          ref_key,
+                          atol=get_default_atol(out_key),
+                          rtol=get_default_rtol(out_key))
+
+
+@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("seq_len", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_batched_rotary_embedding_multi_lora(
+    is_neox_style: bool,
+    batch_size: int,
+    seq_len: int,
+    num_heads: int,
+    head_size: int,
+    rotary_dim: Optional[int],
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    max_position: int = 8192,
+    base: int = 10000,
+) -> None:
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    if rotary_dim is None:
+        rotary_dim = head_size
+    scaling_factors: List[int] = [1, 2, 4]
+    rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, {
+        "type": "linear",
+        "factor": tuple(scaling_factors)
+    })
+    rope = rope.to(dtype=dtype)
+
+    positions = torch.randint(0, max_position, (batch_size, seq_len))
+    query = torch.randn(batch_size,
+                        seq_len,
+                        num_heads * head_size,
+                        dtype=dtype)
+    key = torch.randn_like(query)
+
+    offset_map = torch.tensor(
+        list(
+            accumulate([0] + [
+                max_position * scaling_factor * 2
+                for scaling_factor in scaling_factors[:-1]
+            ])))
+    query_types = torch.randint(0,
+                                len(scaling_factors), (batch_size, seq_len),
+                                device=device)
+    query_offsets = offset_map[query_types]
+
+    # NOTE(woosuk): The reference implementation should be executed first
+    # because the custom kernel is in-place.
+    ref_query, ref_key = rope._forward(positions, query, key, query_offsets)
+    out_query, out_key = rope.forward(positions, query, key,
+                                      query_offsets.flatten())
+    # Compare the results.
+    assert torch.allclose(out_query,
+                          ref_query,
+                          atol=get_default_atol(out_query),
+                          rtol=get_default_rtol(out_query))
+    assert torch.allclose(out_key,
+                          ref_key,
+                          atol=get_default_atol(out_key),
+                          rtol=get_default_rtol(out_key))
--- a/tests/kernels/test_prefix_prefill.py
+++ b/tests/kernels/test_prefix_prefill.py
@@ -0,0 +1,209 @@
+import random
+import time
+
+import pytest
+import torch
+from xformers import ops as xops
+from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask
+
+from vllm.attention.ops.prefix_prefill import context_attention_fwd
+
+NUM_HEADS = [64]
+NUM_QUERIES_PER_KV = [1, 8, 64]
+HEAD_SIZES = [128, 96]
+DTYPES = [torch.float16]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+SLIDING_WINDOW = [0, 16, 64, 128, 256, 512, 2048]
+
+
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOW)
+@torch.inference_mode()
+def test_contexted_kv_attention(
+    num_heads: int,
+    num_queries_per_kv: int,
+    head_size: int,
+    sliding_window: int,
+    dtype: torch.dtype,
+    device: str,
+) -> None:
+    random.seed(0)
+    torch.manual_seed(0)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(0)
+    torch.set_default_device(device)
+
+    # Need this, otherwise when we capture the graph the process
+    # for GPU 1 would run on both GPU0 and GPU1 and things would hang
+    #
+    # see also similar issue: https://github.com/Dao-AILab/flash-attention/issues/523
+    torch.cuda.set_device(device)
+
+    MAX_SEQ_LEN = 1024
+    MAX_CTX_LEN = 1024
+    BS = 10
+    cache_size = 640
+    block_size = 32
+    max_block_per_request = 64
+    query_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)]
+    ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)]
+    seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)]
+    num_kv_heads = num_heads // num_queries_per_kv
+
+    num_tokens = sum(query_lens)
+    query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
+    query.uniform_(-1e-3, 1e-3)
+    output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
+
+    kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype)
+    kv.uniform_(-1e-3, 1e-3)
+    key, value = kv.unbind(dim=1)
+
+    k_cache = torch.zeros(cache_size,
+                          block_size,
+                          num_kv_heads,
+                          head_size,
+                          dtype=dtype)
+    v_cache = torch.zeros(cache_size,
+                          block_size,
+                          num_kv_heads,
+                          head_size,
+                          dtype=dtype)
+    k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
+    v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
+    values = torch.arange(0, cache_size, dtype=torch.long)
+    values = values[torch.randperm(cache_size)]
+    block_table = values[:BS * max_block_per_request].view(
+        BS, max_block_per_request)
+    b_seq_len = torch.tensor(seq_lens, dtype=torch.long)
+    b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
+    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1],
+                                            dtype=torch.long),
+                               dim=0)
+    max_input_len = MAX_SEQ_LEN
+    # copy kv to cache
+    b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
+                                                dtype=torch.long),
+                                   dim=0)
+    for i in range(BS):
+        for j in range(query_lens[i]):
+            k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] +
+                                            j])
+            v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] +
+                                              b_ctx_len[i] + j])
+        cur_ctx = 0
+        block_id = 0
+        while cur_ctx < b_ctx_len[i]:
+            start_loc = b_seq_start_loc[i] + cur_ctx
+            if cur_ctx + block_size > b_ctx_len[i]:
+                end_loc = b_seq_start_loc[i] + b_ctx_len[i]
+            else:
+                end_loc = start_loc + block_size
+            start_slot = block_table[i, block_id] * block_size
+            end_slot = start_slot + end_loc - start_loc
+            k_cache.view(-1, num_kv_heads,
+                         head_size)[start_slot:end_slot].copy_(
+                             key[start_loc:end_loc])
+            v_cache.view(-1, num_kv_heads,
+                         head_size)[start_slot:end_slot].copy_(
+                             value[start_loc:end_loc])
+            cur_ctx += block_size
+            block_id += 1
+    # transpose K_cache[num_blocks, block_size, num_kv_heads, head_size]
+    # to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8]
+    k_cache = k_cache.view(-1, block_size, num_kv_heads, head_size // 8,
+                           8).permute(0, 2, 3, 1, 4).contiguous()
+    # transpose V_cache[num_blocks, block_size, num_kv_heads, head_size]
+    # to V_cache[num_blocks, num_kv_heads, head_size, block_size]
+    v_cache = v_cache.view(-1, block_size, num_kv_heads,
+                           head_size).permute(0, 2, 3, 1).contiguous()
+
+    # Warm up the Triton kernel by calling it once before actually measuring
+    # generation time
+    context_attention_fwd(query,
+                          k,
+                          v,
+                          output,
+                          k_cache,
+                          v_cache,
+                          block_table,
+                          b_start_loc,
+                          b_seq_len,
+                          b_ctx_len,
+                          max_input_len,
+                          sliding_window=sliding_window)
+    torch.cuda.synchronize()
+    start_time = time.time()
+    context_attention_fwd(query,
+                          k,
+                          v,
+                          output,
+                          k_cache,
+                          v_cache,
+                          block_table,
+                          b_start_loc,
+                          b_seq_len,
+                          b_ctx_len,
+                          max_input_len,
+                          sliding_window=sliding_window)
+    torch.cuda.synchronize()
+    end_time = time.time()
+    print(f"triton Time: {(end_time - start_time)*1000:.2f} ms")
+
+    scale = float(1.0 / (head_size**0.5))
+
+    attn_op = xops.fmha.cutlass.FwOp()
+
+    if num_kv_heads != num_heads:
+        # As of Nov 2023, xformers only supports MHA. For MQA/GQA,
+        # project the key and value tensors to the desired number of
+        # heads.
+        #
+        # see also: vllm/model_executor/layers/attention.py
+        query = query.view(query.shape[0], num_kv_heads, num_queries_per_kv,
+                           query.shape[-1])
+        key = key[:, :, None, :].expand(key.shape[0], num_kv_heads,
+                                        num_queries_per_kv, key.shape[-1])
+        value = value[:, :,
+                      None, :].expand(value.shape[0], num_kv_heads,
+                                      num_queries_per_kv, value.shape[-1])
+    query = query.unsqueeze(0)
+    key = key.unsqueeze(0)
+    value = value.unsqueeze(0)
+
+    attn_bias = BlockDiagonalCausalFromBottomRightMask.from_seqlens(
+        query_lens, seq_lens)
+    if sliding_window > 0:
+        attn_bias = attn_bias.make_local_attention_from_bottomright(
+            sliding_window)
+    output_ref = xops.memory_efficient_attention_forward(
+        query,
+        key,
+        value,
+        attn_bias=attn_bias,
+        p=0.0,
+        scale=scale,
+        op=attn_op,
+    )
+    torch.cuda.synchronize()
+    start_time = time.time()
+    output_ref = xops.memory_efficient_attention_forward(
+        query,
+        key,
+        value,
+        attn_bias=attn_bias,
+        p=0.0,
+        scale=scale,
+        op=attn_op,
+    )
+    torch.cuda.synchronize()
+    end_time = time.time()
+    print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms")
+    output_ref = output_ref.reshape(output.shape)
+    assert torch.allclose(output_ref, output, atol=1e-6, rtol=0)
--- a/tests/kernels/test_rand.py
+++ b/tests/kernels/test_rand.py
@@ -0,0 +1,52 @@
+import random
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.ops.rand import seeded_uniform
+from vllm.model_executor.utils import set_random_seed
+
+
+@pytest.mark.parametrize("dtype",
+                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("use_3d", [True, False])
+def test_seeded_uniform(dtype: torch.dtype, use_3d: bool):
+    device = "cuda"
+    for seed in range(512):
+        set_random_seed(seed)
+        rows = random.randint(1, 512)
+        cols = random.randint(1, 64000)
+        if use_3d:
+            third_dim = random.randint(2, 10)
+            dims = [rows, third_dim, cols]
+        else:
+            dims = [rows, cols]
+        seeds = torch.randint(torch.iinfo(torch.long).min,
+                              torch.iinfo(torch.long).max, (rows, ),
+                              device=device)
+
+        # Test that the same seed produces the same output
+        out = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
+        out2 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
+        torch.testing.assert_close(out, out2)
+        # del to save memory
+        del out2
+
+        out3 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
+        torch.testing.assert_close(out, out3)
+        # del to save memory
+        del out3
+
+        # Initialize out tensor with garbage to ensure that it is overwritten
+        out_with_tensor = seeded_uniform(
+            *dims,
+            out=torch.full(
+                (*dims, ),
+                -1,
+                dtype=dtype,
+                device=device,
+            ),
+            seeds=seeds,
+            dtype=dtype,
+        )
+        torch.testing.assert_close(out, out_with_tensor)
--- a/tests/kernels/test_sampler.py
+++ b/tests/kernels/test_sampler.py
@@ -0,0 +1,196 @@
+import gc
+
+import pytest
+import torch
+import triton
+import triton.language as tl
+
+from vllm.model_executor.layers.ops.sample import (
+    MAX_TRITON_N_COLS, _uniform_to_exponential, get_num_triton_sampler_splits,
+    sample)
+from vllm.model_executor.sampling_metadata import SamplingTensors
+from vllm.model_executor.utils import set_random_seed
+
+SINGLE_SPLIT_VOCAB_SIZE = 32000  # llama/mistral/mixtral vocab size
+MULTI_SPLIT_VOCAB_SIZE = MAX_TRITON_N_COLS + 100
+
+
+@pytest.fixture(autouse=True)
+def _cleanup():
+    yield
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+@triton.jit
+def _uniform_to_exponential_kernel(input, output, n: tl.constexpr):
+    idx = tl.arange(0, n)
+    x = tl.load(input + idx)
+    y = _uniform_to_exponential(x)
+    tl.store(output + idx, y)
+
+
+def test_uniform_to_exponential():
+    """Test that we can convert uniform to exponential without div by 0."""
+    input = torch.tensor([0.0, 1.0 - torch.finfo(torch.float32).eps],
+                         dtype=torch.float32,
+                         device="cuda")
+    output = torch.zeros(input.shape, dtype=torch.float32, device="cuda")
+    _uniform_to_exponential_kernel[(1, )](input, output, 2)
+    assert torch.all(torch.isfinite(output))
+    assert torch.all(output > 0)
+    assert torch.all(torch.isfinite(torch.full_like(output, 1.0) / output))
+
+
+@pytest.mark.parametrize("random_sampling", [True, False, "mixed"])
+@pytest.mark.parametrize("max_best_of", [1, 2, 3, 4, 5])
+@pytest.mark.parametrize("modify_greedy_probs", [True, False])
+@pytest.mark.parametrize("seed", [1337])
+@pytest.mark.parametrize("vocab_size",
+                         [SINGLE_SPLIT_VOCAB_SIZE, MULTI_SPLIT_VOCAB_SIZE])
+@pytest.mark.parametrize("save_logprobs", [True, False])
+def test_sample_decoding_only(random_sampling, max_best_of,
+                              modify_greedy_probs, seed, vocab_size,
+                              save_logprobs):
+    set_random_seed(seed)
+    bs = 8
+    probs = torch.zeros((bs, vocab_size), dtype=torch.float32, device="cuda")
+    for i in range(bs):
+        probs[i, i * (vocab_size // bs)] = 1.0
+    logprobs = torch.rand_like(probs)
+    sample_indices = torch.arange(bs, dtype=torch.long, device="cuda")
+    n_splits = get_num_triton_sampler_splits(probs.shape[1])
+    if random_sampling == "mixed":
+        random_sampling_mask = (torch.rand(
+            (1, bs), device="cuda") < 0.5).expand(n_splits, bs)
+    elif random_sampling:
+        random_sampling_mask = torch.ones((n_splits, bs),
+                                          dtype=torch.bool,
+                                          device="cuda")
+    else:
+        random_sampling_mask = torch.zeros((n_splits, bs),
+                                           dtype=torch.bool,
+                                           device="cuda")
+
+    seeds = torch.randint(1,
+                          torch.iinfo(torch.long).max, (n_splits, bs),
+                          device="cuda").mul_(random_sampling_mask)
+    sampled_tokens, sampled_logprobs, sampled_modified_probs = sample(
+        probs=probs,
+        logprobs=logprobs,
+        sample_indices=sample_indices,
+        seeds=seeds,
+        max_best_of=max_best_of,
+        modify_greedy_probs=modify_greedy_probs,
+        save_logprobs=save_logprobs,
+        _save_modified_probs=True)
+    assert sampled_tokens.shape == (bs, max_best_of)
+    for i in range(bs):
+        assert torch.all(sampled_tokens[i] == i * (vocab_size // bs))
+        request_uses_random_sampling = random_sampling_mask[0, i]
+        if modify_greedy_probs and not request_uses_random_sampling:
+            # If we are modifying greedy probs and the request is greedy,
+            # we want to make sure the probs tensor is modified in place
+            assert torch.allclose(
+                probs[i][sampled_tokens[i]],
+                torch.full_like(probs[i][sampled_tokens[i]], 1.0))
+            assert torch.sum(probs[i]) == 1.0
+            assert torch.allclose(
+                sampled_modified_probs[i][0],
+                torch.full_like(sampled_modified_probs[i][0], 1.0))
+        elif request_uses_random_sampling:
+            # If the request is random, we want to make sure
+            # sampled_modified_probs tensor has noise added
+            # (and thus is different from probs tensor)
+            assert not torch.allclose(sampled_modified_probs[i][0],
+                                      probs[i][sampled_tokens[i]])
+        elif not request_uses_random_sampling:
+            # If the request is greedy and we are not modifying greedy probs,
+            # we want to make sure sampled_modified_probs tensor is the same as
+            # the probs tensor.
+            assert torch.allclose(sampled_modified_probs[i][0],
+                                  probs[i][sampled_tokens[i]])
+
+    if save_logprobs:
+        assert sampled_logprobs.shape == (bs, max_best_of)
+        for i in range(bs):
+            for best_of in range(max_best_of):
+                assert torch.all(sampled_logprobs[i] == logprobs[i][
+                    sampled_tokens[i, best_of]])
+    else:
+        assert sampled_logprobs is None
+
+
+@pytest.mark.parametrize("random_sampling", [True, False, "mixed"])
+@pytest.mark.parametrize("max_best_of", [1, 2, 3, 4, 5])
+@pytest.mark.parametrize("modify_greedy_probs", [True, False])
+@pytest.mark.parametrize("seed", [1337])
+@pytest.mark.parametrize("vocab_size",
+                         [SINGLE_SPLIT_VOCAB_SIZE, MULTI_SPLIT_VOCAB_SIZE])
+def test_sample_prompt_logprobs(random_sampling, max_best_of,
+                                modify_greedy_probs, seed, vocab_size):
+    set_random_seed(seed)
+    prompt_sizes = [16, 32, 64, 128] * 2
+    samples = 8
+    bs = samples + sum(prompt_sizes)
+    probs = torch.zeros((bs, vocab_size), dtype=torch.float32, device="cuda")
+    for i in range(bs):
+        probs[i, i * (vocab_size // bs)] = 1.0
+    logprobs = torch.rand_like(probs)
+    sample_indices = torch.tensor(prompt_sizes,
+                                  dtype=torch.long,
+                                  device="cuda").cumsum_(0)
+    n_splits = get_num_triton_sampler_splits(probs.shape[1])
+    if random_sampling == "mixed":
+        random_sampling_mask = torch.rand(
+            (n_splits, samples), device="cuda") < 0.5
+    elif random_sampling:
+        random_sampling_mask = torch.ones((n_splits, samples),
+                                          dtype=torch.bool,
+                                          device="cuda")
+    else:
+        random_sampling_mask = torch.zeros((n_splits, samples),
+                                           dtype=torch.bool,
+                                           device="cuda")
+
+    seeds = torch.randint(1,
+                          torch.iinfo(torch.long).max, (n_splits, samples),
+                          device="cuda").mul_(random_sampling_mask)
+    sampled_tokens, sampled_logprobs, _ = sample(
+        probs=probs,
+        logprobs=logprobs,
+        sample_indices=sample_indices,
+        seeds=seeds,
+        max_best_of=max_best_of,
+        modify_greedy_probs=modify_greedy_probs,
+        save_logprobs=True)
+    assert sampled_tokens.shape == (samples, max_best_of)
+    assert sampled_logprobs.shape == (samples, max_best_of)
+    for i, t in enumerate(sample_indices):
+        assert torch.all(sampled_tokens[i] == t * (vocab_size // bs))
+        for best_of in range(max_best_of):
+            assert torch.all(sampled_logprobs[i] == logprobs[sample_indices[i]]
+                             [sampled_tokens[i, best_of]])
+
+
+@pytest.mark.parametrize("seed", list(range(16)))
+def test_get_sequence_seeds(seed):
+    """Ensure that we get a different child seed from base 
+    seed + extra entropy"""
+    starting_seed = seed
+    seq_seed = None
+    extra_entropy = 1
+    for i in range(512):
+        new_seq_seed = SamplingTensors._get_sequence_seeds(starting_seed,
+                                                           i,
+                                                           seeds_to_generate=1,
+                                                           is_greedy=False)[0]
+        new_seq_seed_extra_entropy = SamplingTensors._get_sequence_seeds(
+            starting_seed,
+            i,
+            extra_entropy,
+            seeds_to_generate=1,
+            is_greedy=False)[0]
+        assert new_seq_seed_extra_entropy != new_seq_seed
+        assert seq_seed != new_seq_seed
+        seq_seed = new_seq_seed
--- a/tests/lora/init.py
+++ b/tests/lora/init.py
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -0,0 +1,179 @@
+import contextlib
+import gc
+import tempfile
+from collections import OrderedDict
+from unittest.mock import MagicMock, patch
+
+import pytest
+import ray
+import torch
+import torch.nn as nn
+from huggingface_hub import snapshot_download
+
+import vllm
+from vllm.config import LoRAConfig
+from vllm.distributed import destroy_model_parallel, initialize_model_parallel
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               MergedColumnParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.model_loader import get_model
+
+
+def cleanup():
+    destroy_model_parallel()
+    with contextlib.suppress(AssertionError):
+        torch.distributed.destroy_process_group()
+    gc.collect()
+    torch.cuda.empty_cache()
+    ray.shutdown()
+
+
+@pytest.fixture(autouse=True)
+def cleanup_fixture():
+    yield
+    cleanup()
+
+
+@pytest.fixture
+def dist_init():
+    if not torch.distributed.is_initialized():
+        temp_file = tempfile.mkstemp()[1]
+        torch.distributed.init_process_group(
+            backend="nccl",
+            world_size=1,
+            rank=0,
+            init_method=f"file://{temp_file}",
+        )
+        torch.distributed.all_reduce(torch.zeros(1).cuda())
+    initialize_model_parallel(1, 1)
+    yield
+    cleanup()
+
+
+@pytest.fixture
+def dist_init_torch_only():
+    if torch.distributed.is_initialized():
+        return
+    temp_file = tempfile.mkstemp()[1]
+    torch.distributed.init_process_group(
+        backend="nccl",
+        world_size=1,
+        rank=0,
+        init_method=f"file://{temp_file}",
+    )
+
+
+@pytest.fixture
+def dummy_model() -> nn.Module:
+    model = nn.Sequential(
+        OrderedDict([
+            ("dense1", ColumnParallelLinear(764, 100)),
+            ("dense2", RowParallelLinear(100, 50)),
+            (
+                "layer1",
+                nn.Sequential(
+                    OrderedDict([
+                        ("dense1", ColumnParallelLinear(100, 10)),
+                        ("dense2", RowParallelLinear(10, 50)),
+                    ])),
+            ),
+            ("act2", nn.ReLU()),
+            ("output", ColumnParallelLinear(50, 10)),
+            ("outact", nn.Sigmoid()),
+            # Special handling for lm_head & sampler
+            ("lm_head", ParallelLMHead(512, 10)),
+            ("logits_processor", LogitsProcessor(512)),
+            ("sampler", Sampler())
+        ]))
+    model.config = MagicMock()
+    return model
+
+
+@pytest.fixture
+def dummy_model_gate_up() -> nn.Module:
+    model = nn.Sequential(
+        OrderedDict([
+            ("dense1", ColumnParallelLinear(764, 100)),
+            ("dense2", RowParallelLinear(100, 50)),
+            (
+                "layer1",
+                nn.Sequential(
+                    OrderedDict([
+                        ("dense1", ColumnParallelLinear(100, 10)),
+                        ("dense2", RowParallelLinear(10, 50)),
+                    ])),
+            ),
+            ("act2", nn.ReLU()),
+            ("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
+            ("outact", nn.Sigmoid()),
+            # Special handling for lm_head & sampler
+            ("lm_head", ParallelLMHead(512, 10)),
+            ("logits_processor", LogitsProcessor(512)),
+            ("sampler", Sampler())
+        ]))
+    model.config = MagicMock()
+    return model
+
+
+@pytest.fixture(scope="session")
+def sql_lora_files():
+    return snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
+
+
+@pytest.fixture(scope="session")
+def mixtral_lora_files():
+    return snapshot_download(repo_id="terrysun/mixtral-lora-adapter")
+
+
+@pytest.fixture(scope="session")
+def gemma_lora_files():
+    return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
+
+
+@pytest.fixture(scope="session")
+def chatglm3_lora_files():
+    return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider")
+
+
+@pytest.fixture(scope="session")
+def baichuan_lora_files():
+    return snapshot_download(repo_id="jeeejeee/baichuan7b-text2sql-spider")
+
+
+@pytest.fixture(scope="session")
+def baichuan_zero_lora_files():
+    # all the lora_B weights are initialized to zero.
+    return snapshot_download(repo_id="jeeejeee/baichuan7b-zero-init")
+
+
+@pytest.fixture(scope="session")
+def tinyllama_lora_files():
+    return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
+
+
+@pytest.fixture
+def llama_2_7b_engine_extra_embeddings() -> nn.Module:
+    cleanup()
+    get_model_old = get_model
+
+    def get_model_patched(*, model_config, device_config, **kwargs):
+        kwargs["lora_config"] = LoRAConfig(max_loras=4, max_lora_rank=8)
+        return get_model_old(model_config=model_config,
+                             device_config=device_config,
+                             **kwargs)
+
+    with patch("vllm.worker.model_runner.get_model", get_model_patched):
+        engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
+    yield engine.llm_engine
+    del engine
+    cleanup()
+
+
+@pytest.fixture
+def llama_2_7b_model_extra_embeddings(
+        llama_2_7b_engine_extra_embeddings) -> nn.Module:
+    yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
+           model_runner.model)
--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
@@ -0,0 +1,108 @@
+import pytest
+
+import vllm
+from vllm.lora.request import LoRARequest
+
+from .conftest import cleanup
+
+MODEL_PATH = "baichuan-inc/Baichuan-7B"
+
+PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501
+
+
+def do_sample(llm, lora_path: str, lora_id: int) -> str:
+    prompts = [
+        PROMPT_TEMPLATE.format(query="How many singers do we have?"),
+        PROMPT_TEMPLATE.format(
+            query=
+            "What is the average, minimum, and maximum age of all singers from France?"  # noqa: E501
+        ),
+        PROMPT_TEMPLATE.format(
+            query=
+            "Show name, country, age for all singers ordered by age from the oldest to the youngest."  # noqa: E501
+        ),
+    ]
+    print(prompts)
+    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
+        if lora_id else None)
+    # Print the outputs.
+    generated_texts = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text.strip()
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    return generated_texts
+
+
+def test_baichuan_lora(baichuan_lora_files):
+    llm = vllm.LLM(MODEL_PATH,
+                   max_model_len=1024,
+                   enable_lora=True,
+                   max_loras=4,
+                   max_lora_rank=64,
+                   trust_remote_code=True)
+
+    expected_lora_output = [
+        "SELECT count(*) FROM singer",
+        "SELECT avg(age) ,  min(age) ,  max(age) FROM singer WHERE Country  =  'France'",  # noqa: E501
+        "SELECT name ,  country ,  age FROM singer ORDER BY age ASC",
+    ]
+
+    output1 = do_sample(llm, baichuan_lora_files, lora_id=1)
+    for i in range(len(expected_lora_output)):
+        assert output1[i] == expected_lora_output[i]
+    output2 = do_sample(llm, baichuan_lora_files, lora_id=2)
+    for i in range(len(expected_lora_output)):
+        assert output2[i] == expected_lora_output[i]
+
+
+@pytest.mark.skip("Requires multiple GPUs")
+def test_baichuan_tensor_parallel_equality(baichuan_lora_files):
+    # Cannot use as it will initialize torch.cuda too early...
+    # if torch.cuda.device_count() < 4:
+    #     pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
+
+    llm_tp1 = vllm.LLM(MODEL_PATH,
+                       enable_lora=True,
+                       max_num_seqs=16,
+                       max_loras=4,
+                       max_lora_rank=64,
+                       tensor_parallel_size=1,
+                       trust_remote_code=True)
+    output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)
+
+    del llm_tp1
+    cleanup()
+
+    llm_tp2 = vllm.LLM(MODEL_PATH,
+                       enable_lora=True,
+                       max_num_seqs=16,
+                       max_loras=4,
+                       max_lora_rank=64,
+                       tensor_parallel_size=2,
+                       trust_remote_code=True)
+    output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2)
+
+    del llm_tp2
+    cleanup()
+
+    assert output_tp1 == output_tp2
+
+    llm_tp4 = vllm.LLM(MODEL_PATH,
+                       enable_lora=True,
+                       max_num_seqs=16,
+                       max_loras=4,
+                       max_lora_rank=64,
+                       tensor_parallel_size=4,
+                       trust_remote_code=True)
+    output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2)
+
+    del llm_tp4
+    cleanup()
+
+    assert output_tp1 == output_tp4
--- a/tests/lora/test_chatglm3.py
+++ b/tests/lora/test_chatglm3.py
@@ -0,0 +1,57 @@
+import vllm
+from vllm.lora.request import LoRARequest
+
+MODEL_PATH = "THUDM/chatglm3-6b"
+
+PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501
+
+
+def do_sample(llm, lora_path: str, lora_id: int) -> str:
+    prompts = [
+        PROMPT_TEMPLATE.format(query="How many singers do we have?"),
+        PROMPT_TEMPLATE.format(
+            query=
+            "What is the average, minimum, and maximum age of all singers from France?"  # noqa: E501
+        ),
+        PROMPT_TEMPLATE.format(
+            query=
+            "Show name, country, age for all singers ordered by age from the oldest to the youngest."  # noqa: E501
+        ),
+    ]
+    print(prompts)
+    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
+        if lora_id else None)
+    # Print the outputs.
+    generated_texts = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text.strip()
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    return generated_texts
+
+
+def test_chatglm3_lora(chatglm3_lora_files):
+    llm = vllm.LLM(MODEL_PATH,
+                   max_model_len=1024,
+                   enable_lora=True,
+                   max_loras=4,
+                   max_lora_rank=64,
+                   trust_remote_code=True)
+
+    expected_lora_output = [
+        "SELECT count(*) FROM singer",
+        "SELECT avg(age) ,  min(age) ,  max(age) FROM singer WHERE country  =  'France'",  # noqa: E501
+        "SELECT name ,  country ,  age FROM singer ORDER BY age",
+    ]
+
+    output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
+    for i in range(len(expected_lora_output)):
+        assert output1[i] == expected_lora_output[i]
+    output2 = do_sample(llm, chatglm3_lora_files, lora_id=2)
+    for i in range(len(expected_lora_output)):
+        assert output2[i] == expected_lora_output[i]
--- a/tests/lora/test_gemma.py
+++ b/tests/lora/test_gemma.py
@@ -0,0 +1,46 @@
+import vllm
+from vllm.lora.request import LoRARequest
+
+MODEL_PATH = "google/gemma-7b"
+
+
+def do_sample(llm, lora_path: str, lora_id: int) -> str:
+    prompts = [
+        "Quote: Imagination is",
+        "Quote: Be yourself;",
+        "Quote: So many books,",
+    ]
+    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
+        if lora_id else None)
+    # Print the outputs.
+    generated_texts = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text.strip()
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    return generated_texts
+
+
+def test_gemma_lora(gemma_lora_files):
+    llm = vllm.LLM(MODEL_PATH,
+                   max_model_len=1024,
+                   enable_lora=True,
+                   max_loras=4)
+
+    expected_lora_output = [
+        "more important than knowledge.\nAuthor: Albert Einstein\n",
+        "everyone else is already taken.\nAuthor: Oscar Wilde\n",
+        "so little time\nAuthor: Frank Zappa\n",
+    ]
+
+    output1 = do_sample(llm, gemma_lora_files, lora_id=1)
+    for i in range(len(expected_lora_output)):
+        assert output1[i].startswith(expected_lora_output[i])
+    output2 = do_sample(llm, gemma_lora_files, lora_id=2)
+    for i in range(len(expected_lora_output)):
+        assert output2[i].startswith(expected_lora_output[i])
--- a/tests/lora/test_layer_variation.py
+++ b/tests/lora/test_layer_variation.py
@@ -0,0 +1,106 @@
+import tempfile
+from random import sample
+from typing import List, Optional
+
+import peft
+import pytest
+from transformers import AutoModelForCausalLM
+
+import vllm
+from vllm.lora.request import LoRARequest
+
+from .conftest import cleanup
+
+MODEL_PATH = "Felladrin/Llama-68M-Chat-v1"
+PROMPTS = [
+    "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]",  # noqa: E501
+    "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]",  # noqa: E501
+    "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]",  # noqa: E501
+]
+
+
+def get_lora_model(model_id: str, target_modules: List[str], rank: int):
+    model = AutoModelForCausalLM.from_pretrained(model_id)
+    lora_config = peft.tuners.lora.LoraConfig(target_modules, rank)
+    lora_model = peft.PeftModel(model, lora_config)
+    return lora_model
+
+
+def do_sample(llm,
+              lora_path: Optional[str] = None,
+              lora_id: Optional[int] = None,
+              logprobs: int = 0,
+              n_tokens: int = 256):
+    prompts = PROMPTS
+    sampling_params = vllm.SamplingParams(temperature=0,
+                                          max_tokens=n_tokens,
+                                          logprobs=logprobs,
+                                          stop=["[/assistant]"])
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
+        if lora_id else None)
+    # Print the outputs.
+    generated_texts = []
+    generated_logprobs = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        generated_logprobs.append([
+            list(logprob.keys()) for out in output.outputs
+            for logprob in out.logprobs
+        ])
+    return generated_logprobs if logprobs else generated_texts
+
+
+SUPPORTED_MODULES = [
+    "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens",
+    "lm_head"
+]
+TARGET_MODULES_LIST = []
+for length in range(2, 6):
+    TARGET_MODULES_LIST.extend(
+        [sample(SUPPORTED_MODULES, length) for _ in range(3)])
+
+
+# Test the correctness when layer and rank are varied
+# step 1: init a base model and serve with LoRA to get the reference results
+# step 2: merge the same LoRA to the base model, serve the merged model
+# step 3: compare the results from step 1 and step 2
+@pytest.mark.parametrize("tp_size", [1])
+@pytest.mark.parametrize("target_modules", TARGET_MODULES_LIST)
+@pytest.mark.parametrize("rank", [8, 16, 32, 64])
+def test_layer_variation_correctness(tp_size, target_modules, rank):
+    llm = vllm.LLM(MODEL_PATH,
+                   enable_lora=True,
+                   max_num_seqs=16,
+                   max_loras=4,
+                   tensor_parallel_size=tp_size,
+                   worker_use_ray=True)
+    model = get_lora_model(MODEL_PATH, target_modules, rank)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        model.save_pretrained(tmpdir)
+        merged_probs = do_sample(llm, tmpdir, 1, logprobs=5, n_tokens=32)
+    del llm
+    cleanup()
+    reference_id_sets = [set(prob[0]) for prob in merged_probs]
+
+    model = get_lora_model(MODEL_PATH, target_modules, rank)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        merged_model = model.merge_and_unload()
+        merged_model.save_pretrained(tmpdir)
+        llm = vllm.LLM(tmpdir,
+                       tokenizer=MODEL_PATH,
+                       enable_lora=False,
+                       max_num_seqs=16,
+                       tensor_parallel_size=tp_size,
+                       worker_use_ray=True)
+    probs = do_sample(llm, logprobs=5, n_tokens=32)
+    del llm
+    cleanup()
+    # verify the top-5 tokens are identical for each token
+    id_sets = [set(prob[0]) for prob in probs]
+    assert id_sets == reference_id_sets
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -0,0 +1,773 @@
+import random
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from vllm.config import LoRAConfig
+from vllm.lora.fully_sharded_layers import (
+    ColumnParallelLinearWithShardedLoRA,
+    MergedColumnParallelLinearWithShardedLoRA,
+    MergedQKVParallelLinearWithShardedLora, RowParallelLinearWithShardedLoRA)
+# yapf conflicts with isort for this block
+# yapf: disable
+from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
+                              LogitsProcessorWithLoRA, LoRAMapping,
+                              MergedColumnParallelLinearWithLoRA,
+                              MergedQKVParallelLinearWithLora,
+                              QKVParallelLinearWithLora,
+                              RowParallelLinearWithLoRA,
+                              VocabParallelEmbeddingWithLoRA)
+# yapf: enable
+from vllm.lora.models import (LoRALayerWeights, PackedLoRALayerWeights,
+                              convert_mapping)
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.utils import set_random_seed
+
+from .utils import DummyLoRAManager
+
+TOLERANCES = {
+    torch.float16: (5e-3, 5e-3),
+    torch.float32: (5e-3, 5e-3),
+    torch.bfloat16: (3e-2, 2e-2),
+}
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+
+
+def get_random_id_to_index(num_loras: int,
+                           num_slots: int,
+                           log: bool = True) -> List[Optional[int]]:
+    """Creates a random lora_id_to_index mapping.
+
+    Args:
+        num_loras: The number of active loras in the mapping.
+        num_slots: The number of slots in the mapping. Must be larger
+            than num_loras.
+        log: Whether to log the output.
+    """
+
+    if num_loras > num_slots:
+        raise ValueError(
+            f"num_loras is higher than num_slots: {num_loras} > {num_slots}. "
+            "num_loras must be less than or equal to num_slots.")
+
+    slots: List[Optional[int]] = [None] * num_slots
+    random_slot_selections = (torch.randperm(num_slots)[:num_loras]).tolist()
+    for lora_id, slot_idx in enumerate(random_slot_selections, start=1):
+        slots[slot_idx] = lora_id
+
+    if log:
+        print(f"Created lora_id_to_index mapping: {slots}.")
+
+    return slots
+
+
+def populate_loras(
+    id_to_index: List[Optional[int]],
+    layer: BaseLayerWithLoRA,
+    layer_weights: torch.Tensor,
+    generate_embeddings_tensor: int = 0,
+    repeats: int = 1,
+) -> Tuple[Dict[int, LoRALayerWeights], Dict[int, List[LoRALayerWeights]]]:
+    """This method populates the lora layers with lora weights.
+
+    Args:
+        id_to_index: a list of lora ids. The index of the lora id
+            represents which memory slot the lora matrices are
+            stored in. A None value indicates a free slot.
+        layer: the LoRAlayer to populate.
+        layer_weights: the PyTorch tensor containing the layer's
+            weights.
+        generate_embeddings_tensor: whether to generate an
+            embeddings tensor for each LoRA.
+        repeats: must only be set for column parallel packed
+            layers. Indicates the number of loras to compose
+            together to create a single lora layer.
+    """
+
+    # Dictionary that maps the lora ID to the
+    # corresponding lora weights.
+    lora_dict: Dict[int, LoRALayerWeights] = dict()
+
+    # Dictionary that maps the lora ID to the
+    # corresponding subloras.
+    sublora_dict: Dict[int, List[LoRALayerWeights]] = dict()
+
+    for slot_idx, lora_id in enumerate(id_to_index):
+        if lora_id is not None:
+            subloras = []
+            sublora_len = layer_weights.shape[0] // repeats
+            for i in range(repeats):
+                sublora = DummyLoRAManager().init_random_lora(
+                    module_name=f"fake_{i}",
+                    weight=layer_weights,
+                    generate_embeddings_tensor=generate_embeddings_tensor,
+                )
+                sublora.lora_b = sublora.lora_b[:, (sublora_len *
+                                                    i):(sublora_len * (i + 1))]
+                sublora.optimize()
+                subloras.append(sublora)
+
+            lora = PackedLoRALayerWeights.pack(
+                subloras) if repeats > 1 else subloras[0]
+
+            layer.set_lora(
+                slot_idx,
+                lora_a=lora.lora_a,
+                lora_b=lora.lora_b,
+                embeddings_tensor=lora.embeddings_tensor,
+            )
+
+            lora_dict[lora_id] = lora
+            sublora_dict[lora_id] = subloras
+
+    return lora_dict, sublora_dict
+
+
+def create_random_inputs(
+    active_lora_ids: List[int],
+    num_inputs: int,
+    input_size: Tuple[int, ...],
+    input_range: Tuple[float, float],
+    input_type: torch.dtype = torch.int,
+) -> Tuple[List[torch.Tensor], List[int], List[int]]:
+    """Creates random inputs.
+
+    Args:
+        active_lora_ids: lora IDs of active lora weights.
+        num_inputs: the number of inputs to create.
+        input_size: the size of each individual input.
+        input_range: the range of values to include in the input.
+            input_range[0] <= possible input values < input_range[1]
+        input_type: the type of values in the input.
+    """
+
+    low, high = input_range
+
+    inputs, index_mapping, prompt_mapping = [], [], []
+    for _ in range(num_inputs):
+        if input_type == torch.int:
+            inputs.append(
+                torch.randint(low=int(low), high=int(high), size=input_size))
+        else:
+            inputs.append(
+                torch.rand(size=input_size, dtype=input_type) * high + low)
+
+        lora_id = random.choice(active_lora_ids)
+        index_mapping += [lora_id] * input_size[0]
+        prompt_mapping += [lora_id]
+
+    return inputs, index_mapping, prompt_mapping
+
+
+@torch.inference_mode()
+@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
+def test_embeddings(dist_init, num_loras, device, vocab_size) -> None:
+
+    torch.set_default_device(device)
+    max_loras = 8
+    lora_config = LoRAConfig(max_loras=max_loras,
+                             max_lora_rank=8,
+                             lora_dtype=torch.float16)
+
+    def create_random_embedding_layer():
+        embedding = VocabParallelEmbedding(vocab_size, 256)
+        embedding.weight.data = torch.rand_like(embedding.weight.data)
+        embedding.weight.data[vocab_size:, :] = 0
+        lora_embedding = VocabParallelEmbeddingWithLoRA(embedding)
+        lora_embedding.create_lora_weights(max_loras, lora_config)
+
+        return embedding, lora_embedding
+
+    for i in range(10):
+        set_random_seed(i)
+
+        id_to_index = get_random_id_to_index(num_loras, max_loras)
+        embedding, lora_embedding = create_random_embedding_layer()
+
+        lora_dict, _ = populate_loras(
+            id_to_index,
+            layer=lora_embedding,
+            layer_weights=embedding.weight.T,
+        )
+
+        inputs, index_mapping, prompt_mapping = create_random_inputs(
+            active_lora_ids=list(lora_dict.keys()),
+            num_inputs=num_loras * 3,
+            input_size=(200, ),
+            input_range=(1, vocab_size),
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
+
+        mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
+                                       vocab_size,
+                                       lora_config.lora_extra_vocab_size)
+        lora_embedding.set_mapping(*mapping_info)
+
+        lora_result = lora_embedding(torch.cat(inputs))
+
+        expected_results = []
+        for input_, lora_id in zip(inputs, prompt_mapping):
+            lora = lora_dict[lora_id]
+            result = embedding(input_)
+            after_a = F.embedding(
+                input_,
+                lora.lora_a,
+            )
+            result += (after_a @ lora.lora_b)
+            expected_results.append(result)
+        expected_result = torch.cat(expected_results)
+
+        rtol, atol = TOLERANCES[lora_result.dtype]
+        assert torch.allclose(lora_result,
+                              expected_result,
+                              rtol=rtol,
+                              atol=atol)
+
+        # Check that resetting the lora weights succeeds
+
+        for slot_idx in range(max_loras):
+            lora_embedding.reset_lora(slot_idx)
+
+        inputs, index_mapping, prompt_mapping = create_random_inputs(
+            active_lora_ids=[0],
+            num_inputs=num_loras * 3,
+            input_size=(200, ),
+            input_range=(1, vocab_size),
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
+
+        mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
+                                       vocab_size,
+                                       lora_config.lora_extra_vocab_size)
+        lora_embedding.set_mapping(*mapping_info, )
+
+        lora_result = lora_embedding(torch.cat(inputs))
+        expected_result = embedding(torch.cat(inputs))
+
+        rtol, atol = TOLERANCES[lora_result.dtype]
+        assert torch.allclose(lora_result,
+                              expected_result,
+                              rtol=rtol,
+                              atol=atol)
+
+
+@torch.inference_mode()
+# @pytest.mark.skip(
+#     reason="Fails when loras are in any slot other than the first.")
+@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
+def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
+                                        vocab_size) -> None:
+
+    torch.set_default_device(device)
+    max_loras = 8
+    lora_config = LoRAConfig(max_loras=max_loras,
+                             max_lora_rank=8,
+                             lora_dtype=torch.float16)
+
+    def create_random_embedding_layer():
+        embedding = VocabParallelEmbedding(vocab_size, 256)
+        embedding_data = torch.rand_like(embedding.weight.data)
+        embedding.weight.data = embedding_data
+        embedding.weight.data[vocab_size:, :] = 0
+        expanded_embedding = VocabParallelEmbedding(
+            vocab_size + lora_config.lora_extra_vocab_size * max_loras,
+            256,
+            org_num_embeddings=vocab_size)
+        expanded_embedding.weight.data[:vocab_size, :] = embedding_data
+        # We need to deepcopy the embedding as it will be modified
+        # in place
+        lora_embedding = VocabParallelEmbeddingWithLoRA(
+            deepcopy(expanded_embedding))
+        lora_embedding.create_lora_weights(max_loras, lora_config)
+
+        return expanded_embedding, lora_embedding
+
+    for i in range(10):
+        set_random_seed(i)
+
+        id_to_index = get_random_id_to_index(num_loras, max_loras)
+        expanded_embedding, lora_embedding = create_random_embedding_layer()
+        lora_dict, _ = populate_loras(
+            id_to_index,
+            layer=lora_embedding,
+            layer_weights=torch.zeros(
+                (256, vocab_size + lora_config.lora_extra_vocab_size)),
+            generate_embeddings_tensor=256,
+        )
+
+        # All embeddings tensors have the same shape.
+        embeddings_tensors = [
+            lora_dict[id].embeddings_tensor for id in sorted(lora_dict.keys())
+        ]
+        embeddings_tensor_len = embeddings_tensors[0].shape[0]
+
+        # Add empty embeddings_tensors for unoccupied lora slots.
+        for _ in range(max_loras - len(embeddings_tensors)):
+            embeddings_tensors.append(torch.zeros(embeddings_tensors[0].shape))
+
+        inputs, index_mapping, prompt_mapping = create_random_inputs(
+            active_lora_ids=list(lora_dict.keys()),
+            num_inputs=num_loras * 3,
+            input_size=(200, ),
+            input_range=(1, vocab_size),
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
+
+        original_inputs = deepcopy(inputs)
+
+        # Force some of the inputs to be in the extended embeddings range
+        # to guarantee that their behavior is tested.
+        for input_, original_input_, lora_id in zip(inputs, original_inputs,
+                                                    prompt_mapping):
+            embedding_id = lora_id - 1
+            input_[-1] = vocab_size + (embedding_id * embeddings_tensor_len)
+            original_input_[-1] = vocab_size
+            input_[-2] = vocab_size + (
+                (embedding_id + 1) * embeddings_tensor_len - 1)
+            original_input_[-2] = vocab_size + embeddings_tensor_len - 1
+
+        mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
+                                       vocab_size,
+                                       lora_config.lora_extra_vocab_size)
+        lora_embedding.set_mapping(*mapping_info, )
+
+        expanded_embedding.weight[vocab_size:vocab_size +
+                                  (embeddings_tensor_len *
+                                   max_loras)] = torch.cat(embeddings_tensors)
+
+        lora_result = lora_embedding(torch.cat(original_inputs))
+
+        expected_results = []
+        for input_, original_input_, lora_id in zip(inputs, original_inputs,
+                                                    prompt_mapping):
+            lora = lora_dict[lora_id]
+            result = expanded_embedding(input_)
+            after_a = F.embedding(
+                original_input_,
+                lora.lora_a,
+            )
+            result += (after_a @ lora.lora_b)
+            expected_results.append(result)
+        expected_result = torch.cat(expected_results)
+
+        rtol, atol = TOLERANCES[lora_result.dtype]
+        assert torch.allclose(lora_result,
+                              expected_result,
+                              rtol=rtol,
+                              atol=atol)
+
+        # Check that resetting the lora weights succeeds
+
+        for slot_idx in range(max_loras):
+            lora_embedding.reset_lora(slot_idx)
+
+        inputs, index_mapping, prompt_mapping = create_random_inputs(
+            active_lora_ids=[0],
+            num_inputs=num_loras * 3,
+            input_size=(200, ),
+            input_range=(1, vocab_size),
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
+
+        original_inputs = deepcopy(inputs)
+
+        mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
+                                       vocab_size,
+                                       lora_config.lora_extra_vocab_size)
+        lora_embedding.set_mapping(*mapping_info, )
+
+        lora_result = lora_embedding(torch.cat(original_inputs))
+        expected_result = expanded_embedding(torch.cat(inputs))
+
+        rtol, atol = TOLERANCES[lora_result.dtype]
+        assert torch.allclose(lora_result,
+                              expected_result,
+                              rtol=rtol,
+                              atol=atol)
+
+
+@torch.inference_mode()
+@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
+def test_lm_head_logits_processor(dist_init, num_loras, device,
+                                  vocab_size) -> None:
+
+    torch.set_default_device(device)
+    max_loras = 8
+    lora_config = LoRAConfig(max_loras=max_loras,
+                             max_lora_rank=8,
+                             lora_dtype=torch.float16)
+
+    def _pretest():
+        linear = ParallelLMHead(vocab_size + lora_config.lora_extra_vocab_size,
+                                1024,
+                                vocab_size,
+                                params_dtype=torch.float16)
+        linear.weight.data = torch.rand_like(linear.weight.data)
+        linear.weight.data[:, vocab_size:] = 0
+        logits_processor = LogitsProcessor(
+            vocab_size + lora_config.lora_extra_vocab_size, vocab_size)
+        lora_logits_processor = LogitsProcessorWithLoRA(
+            logits_processor, 1024, linear.weight.dtype, linear.weight.device)
+        lora_logits_processor.create_lora_weights(max_loras, lora_config)
+
+        return linear, logits_processor, lora_logits_processor
+
+    for i in range(10):
+        set_random_seed(i)
+
+        id_to_index = get_random_id_to_index(num_loras, max_loras)
+        linear, logits_processor, lora_logits_processor = _pretest()
+
+        # NOTE: all the generated loras share the same embeddings tensor.
+        lora_dict, _ = populate_loras(
+            id_to_index,
+            layer=lora_logits_processor,
+            layer_weights=linear.weight,
+            generate_embeddings_tensor=1024,
+        )
+        embeddings_tensor = list(lora_dict.values())[0].embeddings_tensor
+        embeddings_tensor_len = embeddings_tensor.shape[0]
+
+        inputs, index_mapping, prompt_mapping = create_random_inputs(
+            active_lora_ids=list(lora_dict.keys()),
+            num_inputs=8 * num_loras,  # * 3,
+            input_size=(1, 1024),
+            input_range=(0, 1),
+            input_type=torch.float16,
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
+
+        input_ = torch.rand(20, 1024)
+        mapping_info = convert_mapping(
+            lora_mapping,
+            id_to_index,
+            max_loras,
+            vocab_size,
+            lora_config.lora_extra_vocab_size,
+        )
+        lora_logits_processor.set_mapping(*mapping_info, )
+
+        lora_result = lora_logits_processor._get_logits(
+            hidden_states=torch.cat(inputs),
+            embedding=linear.weight,
+            embedding_bias=None)
+
+        original_weight = linear.weight.clone()
+
+        linear.weight[logits_processor.
+                      org_vocab_size:logits_processor.org_vocab_size +
+                      embeddings_tensor_len] = embeddings_tensor
+
+        logits_processor.org_vocab_size = (vocab_size +
+                                           lora_config.lora_extra_vocab_size)
+        expected_results = []
+        for input_, lora_id in zip(inputs, prompt_mapping):
+            lora = lora_dict[lora_id]
+            result = logits_processor._get_logits(hidden_states=input_,
+                                                  embedding=linear.weight,
+                                                  embedding_bias=None)
+            result[:, vocab_size + embeddings_tensor_len:] = float("-inf")
+            result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling
+            expected_results.append(result)
+        expected_result = torch.cat(expected_results)
+        logits_processor.org_vocab_size = vocab_size
+
+        # Check that resetting the lora weights succeeds
+
+        for slot_idx in range(max_loras):
+            lora_logits_processor.reset_lora(slot_idx)
+
+        inputs, index_mapping, prompt_mapping = create_random_inputs(
+            active_lora_ids=[0],
+            num_inputs=8 * num_loras * 3,
+            input_size=(1, 1024),
+            input_range=(0, 1),
+            input_type=torch.float16,
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
+
+        mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
+                                       vocab_size,
+                                       lora_config.lora_extra_vocab_size)
+        lora_logits_processor.set_mapping(*mapping_info, )
+
+        lora_result = lora_logits_processor._get_logits(
+            hidden_states=torch.cat(inputs),
+            embedding=original_weight,
+            embedding_bias=None)[:, :vocab_size]
+        expected_result = logits_processor._get_logits(
+            hidden_states=torch.cat(inputs),
+            embedding=original_weight,
+            embedding_bias=None)
+
+        rtol, atol = TOLERANCES[lora_result.dtype]
+        assert torch.allclose(lora_result,
+                              expected_result,
+                              rtol=rtol,
+                              atol=atol)
+
+
+@torch.inference_mode()
+@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("orientation", ["row", "column"])
+@pytest.mark.parametrize("fully_shard", [True, False])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
+                         device) -> None:
+
+    torch.set_default_device(device)
+    max_loras = 8
+    lora_config = LoRAConfig(max_loras=max_loras,
+                             max_lora_rank=8,
+                             fully_sharded_loras=fully_shard,
+                             lora_dtype=torch.float16)
+
+    def create_random_linear_parallel_layer():
+        if orientation == "row":
+            linear = RowParallelLinear(4096,
+                                       4096,
+                                       bias=False,
+                                       params_dtype=torch.float16)
+            linear.weight.data = torch.rand_like(linear.weight.data)
+            lora_linear = (RowParallelLinearWithLoRA(linear) if not fully_shard
+                           else RowParallelLinearWithShardedLoRA(linear))
+        else:
+            linear = ColumnParallelLinear(4096,
+                                          4096,
+                                          bias=False,
+                                          params_dtype=torch.float16)
+            linear.weight.data = torch.rand_like(linear.weight.data)
+            lora_linear = (ColumnParallelLinearWithLoRA(linear)
+                           if not fully_shard else
+                           ColumnParallelLinearWithShardedLoRA(linear))
+        lora_linear.create_lora_weights(max_loras, lora_config)
+
+        return linear, lora_linear
+
+    for i in range(10):
+        set_random_seed(i)
+
+        id_to_index = get_random_id_to_index(num_loras, max_loras)
+        linear, lora_linear = create_random_linear_parallel_layer()
+
+        lora_dict, _ = populate_loras(
+            id_to_index,
+            layer=lora_linear,
+            layer_weights=linear.weight,
+        )
+
+        inputs, index_mapping, prompt_mapping = create_random_inputs(
+            active_lora_ids=list(lora_dict.keys()),
+            num_inputs=32 * num_loras,
+            input_size=(1, 4096),
+            input_range=(0, 1),
+            input_type=torch.float16,
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
+
+        mapping_info = convert_mapping(
+            lora_mapping,
+            id_to_index,
+            max_loras,
+            512,
+            lora_config.lora_extra_vocab_size,
+        )
+        lora_linear.set_mapping(*mapping_info, )
+
+        lora_result = lora_linear(torch.cat(inputs))[0]
+
+        expected_results = []
+        for input_, lora_id in zip(inputs, prompt_mapping):
+            lora = lora_dict[lora_id]
+            result = linear(input_)[0]
+            result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling
+            expected_results.append(result)
+        expected_result = torch.cat(expected_results)
+
+        rtol, atol = TOLERANCES[lora_result.dtype]
+        assert torch.allclose(lora_result,
+                              expected_result,
+                              rtol=rtol,
+                              atol=atol)
+
+        # Check that resetting the lora weights succeeds
+
+        for slot_idx in range(max_loras):
+            lora_linear.reset_lora(slot_idx)
+
+        inputs, index_mapping, prompt_mapping = create_random_inputs(
+            active_lora_ids=[0],
+            num_inputs=32 * num_loras,
+            input_size=(1, 4096),
+            input_range=(0, 1),
+            input_type=torch.float16,
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
+
+        mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
+                                       512, lora_config.lora_extra_vocab_size)
+        lora_linear.set_mapping(*mapping_info, )
+
+        lora_result = lora_linear(torch.cat(inputs))[0]
+        expected_result = linear(torch.cat(inputs))[0]
+
+        rtol, atol = TOLERANCES[lora_result.dtype]
+        assert torch.allclose(lora_result,
+                              expected_result,
+                              rtol=rtol,
+                              atol=atol)
+
+
+@torch.inference_mode()
+@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("repeats", [1, 2, 3])
+@pytest.mark.parametrize("fully_shard", [True, False])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
+                                device) -> None:
+
+    torch.set_default_device(device)
+    max_loras = 8
+    lora_config = LoRAConfig(max_loras=max_loras,
+                             max_lora_rank=8,
+                             fully_sharded_loras=fully_shard,
+                             lora_dtype=torch.float16)
+
+    def create_column_parallel_packed_layer():
+        if repeats == 2:
+            linear = MergedColumnParallelLinear(4096, [4096] * repeats,
+                                                bias=False,
+                                                params_dtype=torch.float16)
+            linear.weight.data = torch.rand_like(linear.weight.data)
+            lora_linear = (MergedColumnParallelLinearWithLoRA(linear)
+                           if not fully_shard else
+                           MergedColumnParallelLinearWithShardedLoRA(linear))
+        elif repeats == 3:
+            linear = QKVParallelLinear(4096,
+                                       64,
+                                       32,
+                                       bias=False,
+                                       params_dtype=torch.float16)
+            linear.weight.data = torch.rand_like(linear.weight.data)
+            lora_linear = (MergedQKVParallelLinearWithLora(linear)
+                           if not fully_shard else
+                           MergedQKVParallelLinearWithShardedLora(linear))
+        else:
+            linear = QKVParallelLinear(4096,
+                                       64,
+                                       32,
+                                       bias=False,
+                                       params_dtype=torch.float16)
+            linear.weight.data = torch.rand_like(linear.weight.data)
+            lora_linear = QKVParallelLinearWithLora(linear)
+
+        @dataclass
+        class FakeConfig:
+            hidden_size = 4096
+            num_key_value_heads = 32
+            num_attention_heads = 32
+
+        lora_linear.create_lora_weights(max_loras,
+                                        lora_config,
+                                        model_config=FakeConfig())
+
+        return linear, lora_linear
+
+    for i in range(10):
+        set_random_seed(i)
+
+        id_to_index = get_random_id_to_index(num_loras, max_loras)
+
+        linear, lora_linear = create_column_parallel_packed_layer()
+
+        lora_dict, sublora_dict = populate_loras(
+            id_to_index,
+            layer=lora_linear,
+            layer_weights=linear.weight,
+            repeats=repeats,
+        )
+
+        inputs, index_mapping, prompt_mapping = create_random_inputs(
+            active_lora_ids=list(lora_dict.keys()),
+            num_inputs=32 * num_loras,
+            input_size=(1, 4096),
+            input_range=(0, 1),
+            input_type=torch.float16,
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
+
+        mapping_info = convert_mapping(
+            lora_mapping,
+            id_to_index,
+            max_loras,
+            512,
+            lora_config.lora_extra_vocab_size,
+        )
+        lora_linear.set_mapping(*mapping_info)
+
+        lora_result = lora_linear(torch.cat(inputs))[0]
+
+        expected_results = []
+        for input_, lora_id in zip(inputs, prompt_mapping):
+            result = linear(input_)[0]
+            subloras = sublora_dict[lora_id]
+            for i, sublora in enumerate(subloras):
+                result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] *
+                       (i + 1)] += (input_ @ sublora.lora_a @ sublora.lora_b *
+                                    sublora.scaling)
+            expected_results.append(result)
+        expected_result = torch.cat(expected_results)
+
+        rtol, atol = TOLERANCES[lora_result.dtype]
+        assert torch.allclose(lora_result,
+                              expected_result,
+                              rtol=rtol,
+                              atol=atol)
+
+        for slot_idx in range(max_loras):
+            lora_linear.reset_lora(slot_idx)
+
+        inputs, index_mapping, prompt_mapping = create_random_inputs(
+            active_lora_ids=[0],
+            num_inputs=32 * num_loras,
+            input_size=(1, 4096),
+            input_range=(0, 1),
+            input_type=torch.float16,
+        )
+        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
+
+        mapping_info = convert_mapping(
+            lora_mapping,
+            id_to_index,
+            max_loras,
+            512,
+            lora_config.lora_extra_vocab_size,
+        )
+        lora_linear.set_mapping(*mapping_info)
+
+        lora_result = lora_linear(torch.cat(inputs))[0]
+        expected_result = linear(torch.cat(inputs))[0]
+
+        rtol, atol = TOLERANCES[lora_result.dtype]
+        assert torch.allclose(lora_result,
+                              expected_result,
+                              rtol=rtol,
+                              atol=atol)
--- a/tests/lora/test_llama.py
+++ b/tests/lora/test_llama.py
@@ -0,0 +1,148 @@
+import pytest
+import ray
+
+import vllm
+from vllm.lora.request import LoRARequest
+
+from .conftest import cleanup
+
+MODEL_PATH = "meta-llama/Llama-2-7b-hf"
+
+
+def do_sample(llm, lora_path: str, lora_id: int):
+    prompts = [
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]",  # noqa: E501
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]",  # noqa: E501
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",  # noqa: E501
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]"  # noqa: E501
+    ]
+    sampling_params = vllm.SamplingParams(temperature=0,
+                                          max_tokens=256,
+                                          stop=["[/assistant]"])
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
+        if lora_id else None)
+    # Print the outputs.
+    generated_texts = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    return generated_texts
+
+
+@pytest.mark.parametrize("tp_size", [1])
+def test_llama_lora(sql_lora_files, tp_size):
+    # Cannot use as it will initialize torch.cuda too early...
+    # if torch.cuda.device_count() < tp_size:
+    #     pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+
+    llm = vllm.LLM(MODEL_PATH,
+                   enable_lora=True,
+                   max_num_seqs=16,
+                   max_loras=4,
+                   tensor_parallel_size=tp_size)
+
+    expected_no_lora_output = [
+        "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]",  # noqa: E501
+        " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ",  # noqa: E501
+        "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m",  # noqa: E501
+        " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ",  # noqa: E501
+        " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ",  # noqa: E501
+        "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE",  # noqa: E501
+    ]
+    expected_lora_output = [
+        "  SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ",  # noqa: E501
+        "  SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",  # noqa: E501
+        "  SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ",  # noqa: E501
+        "  SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ",  # noqa: E501
+        "  SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",  # noqa: E501
+        "  SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' "  # noqa: E501
+    ]
+
+    print("lora adapter created")
+    assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output
+
+    print("lora 1")
+    assert do_sample(llm, sql_lora_files, lora_id=1) == expected_lora_output
+
+    print("no lora")
+    assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output
+
+    print("lora 2")
+    assert do_sample(llm, sql_lora_files, lora_id=2) == expected_lora_output
+
+    print("removing lora")
+
+
+@pytest.mark.skip("Requires multiple GPUs")
+def test_llama_tensor_parallel_equality(sql_lora_files):
+    # Cannot use as it will initialize torch.cuda too early...
+    # if torch.cuda.device_count() < 4:
+    #     pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
+
+    llm_tp1 = vllm.LLM(MODEL_PATH,
+                       enable_lora=True,
+                       max_num_seqs=16,
+                       max_loras=4,
+                       tensor_parallel_size=1)
+    output_tp1 = do_sample(llm_tp1, sql_lora_files, lora_id=1)
+
+    del llm_tp1
+    cleanup()
+
+    llm_tp2 = vllm.LLM(MODEL_PATH,
+                       enable_lora=True,
+                       max_num_seqs=16,
+                       max_loras=4,
+                       tensor_parallel_size=2)
+    output_tp2 = do_sample(llm_tp2, sql_lora_files, lora_id=1)
+
+    del llm_tp2
+    cleanup()
+
+    assert output_tp1 == output_tp2
+
+    llm_tp4 = vllm.LLM(MODEL_PATH,
+                       enable_lora=True,
+                       max_num_seqs=16,
+                       max_loras=4,
+                       tensor_parallel_size=4)
+    output_tp4 = do_sample(llm_tp4, sql_lora_files, lora_id=1)
+
+    del llm_tp4
+    cleanup()
+
+    assert output_tp1 == output_tp4
+
+
+def test_llama_lora_warmup(sql_lora_files):
+    """Test that the LLM initialization works with a warmup LORA path and
+    is more conservative"""
+
+    @ray.remote(num_gpus=1)
+    def get_num_gpu_blocks_lora():
+        llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16)
+        num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks
+        return num_gpu_blocks_lora_warmup
+
+    @ray.remote(num_gpus=1)
+    def get_num_gpu_blocks_no_lora():
+        llm = vllm.LLM(MODEL_PATH, max_num_seqs=16)
+        num_gpu_blocks_no_lora_warmup = (
+            llm.llm_engine.cache_config.num_gpu_blocks)
+        return num_gpu_blocks_no_lora_warmup
+
+    num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote())
+    num_gpu_blocks_no_lora_warmup = ray.get(
+        get_num_gpu_blocks_no_lora.remote())
+    assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, (
+        "The warmup with lora should be more "
+        "conservative than without lora, therefore the number of "
+        "memory blocks for the KV cache should be "
+        "less when using lora than when not using lora")
--- a/tests/lora/test_lora.py
+++ b/tests/lora/test_lora.py
@@ -0,0 +1,224 @@
+import pytest
+import torch
+
+from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice
+
+from .utils import DummyLoRAManager
+
+TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4]
+QKV_TENSOR_SIZES = [
+    (8192, 1024, 1024),
+    (8192 // 8, 1024 // 8, 1024 // 8),
+    (4096, 4096, 4096),
+    (4096 // 2, 4096 // 2, 4096 // 2),
+]
+BATCH_SIZES = [8, 32, 256]
+RANKS = [8]
+DTYPES = [torch.float16]
+TOLERANCES = {
+    torch.float16: (5e-3, 5e-3),
+    torch.bfloat16: (3e-2, 2e-2),
+}
+
+
+@pytest.mark.parametrize("m", TENSOR_SIZES)
+@pytest.mark.parametrize("n", TENSOR_SIZES)
+@pytest.mark.parametrize("k", BATCH_SIZES)
+@pytest.mark.parametrize("rank", RANKS)
+@pytest.mark.parametrize("dtype", DTYPES)
+def test_apply_lora(m, n, k, rank, dtype) -> None:
+    manager = DummyLoRAManager()
+
+    module_name = "module"
+    weight = torch.rand([m, n], device="cuda", dtype=dtype)
+
+    manager.init_random_lora(module_name, weight, rank=rank)
+    lora = manager.get_module_lora(module_name)
+
+    input = torch.rand(k, n, device="cuda", dtype=dtype)
+    expected = input @ lora.lora_a @ lora.lora_b * lora.scaling
+
+    lora_a_stack = torch.zeros(8,
+                               1,
+                               lora.lora_a.shape[1],
+                               lora.lora_a.shape[0],
+                               device="cuda",
+                               dtype=dtype)
+    lora_b_stack = torch.zeros(8,
+                               1,
+                               lora.lora_b.shape[1],
+                               lora.lora_b.shape[0],
+                               device="cuda",
+                               dtype=dtype)
+    for i in range(lora_a_stack.shape[0]):
+        lora_a_stack[i][0] = lora.lora_a.T
+        lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T
+
+    output = torch.zeros(k, m, device="cuda", dtype=dtype)
+    _apply_lora(
+        input, lora_a_stack, lora_b_stack,
+        torch.randint(0, lora_a_stack.shape[0], (len(input), ), device="cuda"),
+        output)
+
+    rtol, atol = TOLERANCES[dtype]
+    assert torch.allclose(expected, output, rtol=rtol, atol=atol)
+
+    output[:] = 0
+    _apply_lora(input, lora_a_stack, lora_b_stack,
+                torch.full((len(input), ), -1, device="cuda"), output)
+    assert torch.allclose(torch.zeros_like(output), output)
+
+    manager.reset_lora()
+
+
+@pytest.mark.parametrize("m", TENSOR_SIZES)
+@pytest.mark.parametrize("n", TENSOR_SIZES)
+@pytest.mark.parametrize("k", BATCH_SIZES)
+@pytest.mark.parametrize("rank", RANKS)
+@pytest.mark.parametrize("dtype", DTYPES)
+def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None:
+    if m % 2 != 0:
+        pytest.skip("m must be divisible by 2")
+    if m // 2 not in TENSOR_SIZES:
+        pytest.skip("m//2 must be in TENSOR_SIZES")
+
+    manager = DummyLoRAManager()
+
+    module_name = "module"
+    weight = torch.rand([m // 2, n], device="cuda", dtype=dtype)
+
+    manager.init_random_lora(module_name + "1", weight, rank=rank)
+    lora_1 = manager.get_module_lora(module_name + "1")
+    manager.init_random_lora(module_name + "2", weight, rank=rank)
+    lora_2 = manager.get_module_lora(module_name + "2")
+
+    input = torch.rand(k, n, device="cuda", dtype=dtype)
+    expected = torch.cat([
+        input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling,
+        input @ lora_2.lora_a @ lora_2.lora_b * lora_2.scaling
+    ],
+                         dim=1)
+
+    lora_a_stacks = [
+        torch.zeros(8,
+                    1,
+                    lora_1.lora_a.shape[1],
+                    lora_1.lora_a.shape[0],
+                    device="cuda",
+                    dtype=dtype) for i in range(2)
+    ]
+    lora_b_stacks = [
+        torch.zeros(8,
+                    1,
+                    lora_1.lora_b.shape[1],
+                    lora_1.lora_b.shape[0],
+                    device="cuda",
+                    dtype=dtype) for i in range(2)
+    ]
+    for i in range(lora_a_stacks[0].shape[0]):
+        lora_a_stacks[0][i][0] = lora_1.lora_a.T
+        lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T
+        lora_a_stacks[1][i][0] = lora_2.lora_a.T
+        lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T
+
+    output = torch.zeros(k, m, device="cuda", dtype=dtype)
+    _apply_lora_packed_nslice(
+        input, lora_a_stacks, lora_b_stacks,
+        torch.randint(0,
+                      lora_a_stacks[0].shape[0], (len(input), ),
+                      device="cuda"), output, (m // 2, m // 2))
+
+    rtol, atol = TOLERANCES[dtype]
+    assert torch.allclose(expected, output, rtol=rtol, atol=atol)
+
+    output[:] = 0
+    _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks,
+                              torch.full((len(input), ), -1, device="cuda"),
+                              output, (m // 2, m // 2))
+    assert torch.allclose(torch.zeros_like(output), output)
+
+    manager.reset_lora()
+
+
+@pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES)
+@pytest.mark.parametrize("n", TENSOR_SIZES)
+@pytest.mark.parametrize("k", BATCH_SIZES)
+@pytest.mark.parametrize("rank", RANKS)
+@pytest.mark.parametrize("dtype", DTYPES)
+def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None:
+    manager = DummyLoRAManager()
+
+    module_name = "module"
+    weight_q = torch.empty(qkv[0], n, device="cuda", dtype=dtype)
+    weight_kv = torch.empty(qkv[1], n, device="cuda", dtype=dtype)
+
+    manager.init_random_lora(module_name + "q", weight_q, rank=rank)
+    lora_q = manager.get_module_lora(module_name + "q")
+    manager.init_random_lora(module_name + "k", weight_kv, rank=rank)
+    lora_k = manager.get_module_lora(module_name + "k")
+    manager.init_random_lora(module_name + "v", weight_kv, rank=rank)
+    lora_v = manager.get_module_lora(module_name + "v")
+
+    input = torch.rand(k, n, device="cuda", dtype=dtype)
+    expected = torch.cat([
+        input @ lora_q.lora_a @ lora_q.lora_b * lora_q.scaling,
+        input @ lora_k.lora_a @ lora_k.lora_b * lora_k.scaling,
+        input @ lora_v.lora_a @ lora_v.lora_b * lora_v.scaling
+    ],
+                         dim=1)
+
+    lora_a_stacks = [
+        torch.zeros(8,
+                    1,
+                    lora_q.lora_a.shape[1],
+                    lora_q.lora_a.shape[0],
+                    device="cuda",
+                    dtype=dtype)
+    ] + [
+        torch.zeros(8,
+                    1,
+                    lora_k.lora_a.shape[1],
+                    lora_k.lora_a.shape[0],
+                    device="cuda",
+                    dtype=dtype) for i in range(2)
+    ]
+    lora_b_stacks = [
+        torch.zeros(8,
+                    1,
+                    lora_q.lora_b.shape[1],
+                    lora_q.lora_b.shape[0],
+                    device="cuda",
+                    dtype=dtype)
+    ] + [
+        torch.zeros(8,
+                    1,
+                    lora_k.lora_b.shape[1],
+                    lora_k.lora_b.shape[0],
+                    device="cuda",
+                    dtype=dtype) for i in range(2)
+    ]
+    for i in range(lora_a_stacks[0].shape[0]):
+        lora_a_stacks[0][i][0] = lora_q.lora_a.T
+        lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T
+        lora_a_stacks[1][i][0] = lora_k.lora_a.T
+        lora_b_stacks[1][i][0] = (lora_k.lora_b * lora_k.scaling).T
+        lora_a_stacks[2][i][0] = lora_v.lora_a.T
+        lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T
+
+    output = torch.zeros(k, sum(qkv), device="cuda", dtype=dtype)
+    _apply_lora_packed_nslice(
+        input, lora_a_stacks, lora_b_stacks,
+        torch.randint(0,
+                      lora_a_stacks[0].shape[0], (len(input), ),
+                      device="cuda"), output, (qkv[0], qkv[1], qkv[2]))
+
+    rtol, atol = TOLERANCES[dtype]
+    assert torch.allclose(expected, output, rtol=rtol, atol=atol)
+
+    output[:] = 0
+    _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks,
+                              torch.full((len(input), ), -1, device="cuda"),
+                              output, (qkv[0], qkv[1], qkv[2]))
+    assert torch.allclose(torch.zeros_like(output), output)
+
+    manager.reset_lora()
--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -0,0 +1,58 @@
+import pytest
+
+from vllm.lora.models import LoRAModel
+from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
+
+lora_lst = ["baichuan7B", "baichuan7B-zero", "chatglm3-6b"]
+
+
+@pytest.mark.parametrize("lora_name", lora_lst)
+def test_load_checkpoints(
+    lora_name,
+    baichuan_lora_files,
+    baichuan_zero_lora_files,
+    chatglm3_lora_files,
+):
+    supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules
+    packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
+    embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
+    embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
+    expected_lora_modules = []
+    for module in supported_lora_modules:
+        if module in packed_modules_mapping:
+            expected_lora_modules.extend(packed_modules_mapping[module])
+        else:
+            expected_lora_modules.append(module)
+    if lora_name == "baichuan7B":
+        # For the baichuan7B model, load it's LoRA,
+        # and the test should pass.
+        LoRAModel.from_local_checkpoint(
+            baichuan_lora_files,
+            expected_lora_modules,
+            lora_model_id=1,
+            device="cpu",
+            embedding_modules=embedding_modules,
+            embedding_padding_modules=embed_padding_modules)
+    elif lora_name == "baichuan7B-zero":
+        #Test that the target_modules contain prefix
+        # such as "model.layers.0.self_atten.W_pack", and
+        # the test should pass.
+        LoRAModel.from_local_checkpoint(
+            baichuan_zero_lora_files,
+            expected_lora_modules,
+            lora_model_id=1,
+            device="cpu",
+            embedding_modules=embedding_modules,
+            embedding_padding_modules=embed_padding_modules)
+    else:
+        # For the baichuan7B model, load chatglm3-6b's LoRA,
+        # and the test should raise the following error.
+        expected_error = "Please verify that the loaded LoRA module is correct"  # noqa: E501
+        with pytest.raises(ValueError, match=expected_error):
+            LoRAModel.from_local_checkpoint(
+                chatglm3_lora_files,
+                expected_lora_modules,
+                lora_model_id=1,
+                device="cpu",
+                embedding_modules=embedding_modules,
+                embedding_padding_modules=embed_padding_modules)
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -0,0 +1,487 @@
+import os
+from typing import List
+
+import pytest
+import torch
+from safetensors.torch import load_file
+from torch import nn
+
+from vllm.config import LoRAConfig
+from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
+                              MergedColumnParallelLinearWithLoRA,
+                              RowParallelLinearWithLoRA)
+from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
+from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager,
+                              LRUCacheLoRAModelManager)
+from vllm.lora.request import LoRARequest
+from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
+                                      WorkerLoRAManager)
+from vllm.model_executor.layers.linear import RowParallelLinear
+
+EMBEDDING_MODULES = {
+    "embed_tokens": "input_embeddings",
+    "lm_head": "output_embeddings",
+}
+
+EMBEDDING_PADDING_MODULES = ["lm_head"]
+
+
+def test_from_lora_tensors(sql_lora_files):
+    tensors = load_file(
+        os.path.join(sql_lora_files, "adapter_model.safetensors"))
+    new_embeddings = load_file(
+        os.path.join(sql_lora_files, "new_embeddings.safetensors"))
+    lora_model = LoRAModel.from_lora_tensors(
+        1,
+        8,
+        16,
+        tensors,
+        "cuda",
+        embeddings=new_embeddings,
+        embedding_modules=EMBEDDING_MODULES,
+        embedding_padding_modules=EMBEDDING_PADDING_MODULES)
+    for module_name, lora in lora_model.loras.items():
+        assert lora.module_name == module_name
+        assert lora.rank == 8
+        assert lora.lora_alpha == 16
+        assert lora.lora_a is not None
+        assert lora.lora_b is not None
+        assert (lora.lora_a.shape[1] == lora.lora_b.shape[0]
+                ), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}"
+        assert lora.lora_a.shape[1] == 8
+        embeddings_module = next(
+            (k for k in EMBEDDING_MODULES if k in module_name), None)
+        if embeddings_module:
+            assert torch.equal(
+                lora.embeddings_tensor,
+                new_embeddings[EMBEDDING_MODULES[embeddings_module]].to(
+                    device=lora.embeddings_tensor.device))
+        else:
+            assert lora.embeddings_tensor is None
+
+
+def create_lora(lora_id: int, model: nn.Module,
+                sub_modules: List[str]) -> LoRAModel:
+    loras = {}
+    for name in sub_modules:
+        w = model.get_submodule(name).weight
+        loras[name] = LoRALayerWeights(
+            name,
+            8,
+            16,
+            torch.rand([w.shape[1], 8], device="cuda"),
+            torch.rand([8, w.shape[0]], device="cuda"),
+        )
+    return LoRAModel(lora_id, 8, loras)
+
+
+def create_packed_lora(
+    lora_id: int,
+    model: nn.Module,
+    module_name,
+    replaced_module_names,
+    empty_replaced_module_name=None,
+) -> LoRAModel:
+    w = model.get_submodule(module_name).weight
+    loras = {}
+    for replaced_module_name in replaced_module_names:
+        if replaced_module_name == empty_replaced_module_name:
+            continue
+        loras[replaced_module_name] = LoRALayerWeights(
+            replaced_module_name,
+            8,
+            16,
+            torch.rand([w.shape[1], 8], device="cuda"),
+            torch.rand([8, w.shape[0] // len(replaced_module_names)],
+                       device="cuda"),
+        )
+    return LoRAModel(lora_id, 8, loras)
+
+
+def test_replace_submodules(dist_init, dummy_model):
+    model = dummy_model
+    model.supported_lora_modules = ["dense1", "layer1.dense2"]
+    model.packed_modules_mapping = {}
+    manager = LoRAModelManager(
+        model, 1, 1, 1,
+        LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8))
+    model = manager.model
+
+    assert isinstance(model.get_submodule("dense1"),
+                      ColumnParallelLinearWithLoRA)
+    assert isinstance(model.get_submodule("layer1.dense1"),
+                      ColumnParallelLinearWithLoRA)
+    assert isinstance(model.get_submodule("dense2"), RowParallelLinear)
+    assert isinstance(model.get_submodule("layer1.dense2"),
+                      RowParallelLinearWithLoRA)
+
+
+def test_lora_model_manager(dist_init, dummy_model):
+    model = dummy_model
+    model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
+    model.packed_modules_mapping = {}
+    model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"])
+    model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"])
+    model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"])
+    manager = LoRAModelManager(
+        model, 2, 2, 2,
+        LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2))
+    assert all(x is None for x in manager.lora_index_to_id)
+    assert manager.add_lora(model_lora1)
+    assert manager.activate_lora(1)
+    assert manager.lora_index_to_id[0] == 1
+    assert not manager.add_lora(model_lora1)
+    assert not manager.activate_lora(1)
+    assert manager.add_lora(model_lora2)
+    assert manager.activate_lora(2)
+    assert manager.lora_index_to_id[0] == 1
+    assert manager.lora_index_to_id[1] == 2
+    assert not manager.add_lora(model_lora2)
+    assert not manager.activate_lora(2)
+    assert manager.add_lora(model_lora3)
+    assert manager.lora_index_to_id[0] == 1
+    assert manager.lora_index_to_id[1] == 2
+    with pytest.raises(ValueError):
+        assert manager.activate_lora(3)
+    assert manager.lora_index_to_id[0] == 1
+    assert manager.lora_index_to_id[1] == 2
+    assert manager.remove_lora(model_lora2.id)
+    assert manager.lora_index_to_id[1] is None
+    assert not manager.remove_lora(model_lora2.id)
+    assert manager.remove_lora(model_lora1.id)
+    assert not manager.remove_lora(model_lora1.id)
+    assert manager.add_lora(model_lora1)
+    assert manager.lora_index_to_id[0] is None
+    assert manager.lora_index_to_id[1] is None
+    assert manager.add_lora(model_lora2)
+    assert manager.activate_lora(3)
+    assert manager.lora_index_to_id[0] == 3
+    assert manager.lora_index_to_id[1] is None
+    assert manager.activate_lora(2)
+    assert manager.lora_index_to_id[0] == 3
+    assert manager.lora_index_to_id[1] == 2
+
+
+def test_lora_lru_cache_model_manager(dist_init, dummy_model):
+    model = dummy_model
+    model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
+    model.packed_modules_mapping = {}
+    model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"])
+    model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"])
+    model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"])
+    manager = LRUCacheLoRAModelManager(
+        model, 2, 2, 2,
+        LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2))
+    assert all(x is None for x in manager.lora_index_to_id)
+    assert manager.add_lora(model_lora1)
+    assert manager.activate_lora(1)
+    assert manager.lora_index_to_id[0] == 1
+    assert not manager.add_lora(model_lora1)
+    assert not manager.activate_lora(1)
+    assert manager.add_lora(model_lora2)
+    assert manager.activate_lora(2)
+    assert manager.lora_index_to_id[0] == 1
+    assert manager.lora_index_to_id[1] == 2
+    assert not manager.add_lora(model_lora2)
+    assert not manager.activate_lora(2)
+    assert manager.add_lora(model_lora3)
+    assert manager.lora_index_to_id[0] == 1
+    assert manager.lora_index_to_id[1] == 2
+    assert manager.activate_lora(3)
+    assert manager.lora_index_to_id[0] == 3
+    assert manager.lora_index_to_id[1] == 2
+    assert manager.remove_lora(model_lora2.id)
+    assert manager.lora_index_to_id[1] is None
+    assert not manager.remove_lora(model_lora2.id)
+    assert manager.remove_lora(model_lora1.id)
+    assert not manager.remove_lora(model_lora1.id)
+    assert manager.add_lora(model_lora1)
+    assert manager.activate_lora(1)
+    assert manager.lora_index_to_id[0] == 3
+    assert manager.lora_index_to_id[1] == 1
+    assert manager.add_lora(model_lora2)
+    assert manager.deactivate_lora(3)
+    assert manager.lora_index_to_id[0] is None
+    assert manager.lora_index_to_id[1] == 1
+    assert manager.activate_lora(2)
+    assert manager.lora_index_to_id[0] == 2
+    assert manager.lora_index_to_id[1] == 1
+    assert manager.activate_lora(3)
+    assert manager.lora_index_to_id[0] == 2
+    assert manager.lora_index_to_id[1] == 3
+
+
+def test_lru_lora_model_manager(dist_init, dummy_model):
+    # This tests just the LRU cache functionality, everything else is
+    # tested in test_lora_model_manager
+    model = dummy_model
+    model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
+    model.packed_modules_mapping = {}
+    model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"])
+    model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"])
+    model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"])
+    model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"])
+    manager = LRUCacheLoRAModelManager(
+        model, 2, 2, 2,
+        LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2))
+
+    assert all(x is None for x in manager.lora_index_to_id)
+
+    # Add up to capacity
+    assert manager.add_lora(model_lora1)
+    assert manager.add_lora(model_lora2)
+    assert manager.activate_lora(1)
+    assert manager.activate_lora(2)
+
+    assert set(manager.list_loras()) == {1, 2}
+    assert manager.lora_index_to_id[0] == 1
+    assert manager.lora_index_to_id[1] == 2
+
+    # Add over capacity
+    assert manager.add_lora(model_lora3)
+    assert manager.add_lora(model_lora4)
+    assert manager.activate_lora(3)
+    assert manager.activate_lora(4)
+
+    assert set(manager.list_loras()) == {3, 4}
+    assert manager.lora_index_to_id[0] == 3
+    assert manager.lora_index_to_id[1] == 4
+
+    # Add 3 again to move it to the top and then add 2
+    # should return false since it's in already
+    assert not manager.add_lora(model_lora3)
+    assert not manager.activate_lora(3)
+    assert manager.add_lora(model_lora2)
+    assert manager.activate_lora(2)
+
+    assert set(manager.list_loras()) == {3, 2}
+    assert manager.lora_index_to_id[0] == 3
+    assert manager.lora_index_to_id[1] == 2
+
+    # Remove manually
+    assert manager.remove_lora(3)
+    assert not manager.remove_lora(3)
+
+    assert set(manager.list_loras()) == {2}
+    assert manager.lora_index_to_id[0] is None
+    assert manager.lora_index_to_id[1] == 2
+
+    assert manager.add_lora(model_lora3)
+    assert manager.activate_lora(3)
+    assert manager.add_lora(model_lora4)
+    assert manager.activate_lora(4)
+
+    assert set(manager.list_loras()) == {3, 4}
+    assert manager.lora_index_to_id[0] == 3
+    assert manager.lora_index_to_id[1] == 4
+
+    assert manager.remove_oldest_lora()
+    assert set(manager.list_loras()) == {4}
+    assert manager.lora_index_to_id[0] is None
+    assert manager.lora_index_to_id[1] == 4
+
+    assert manager.remove_oldest_lora()
+    assert set(manager.list_loras()) == set()
+    assert all(x is None for x in manager.lora_index_to_id)
+
+    assert not manager.remove_oldest_lora()
+    assert set(manager.list_loras()) == set()
+    assert all(x is None for x in manager.lora_index_to_id)
+
+
+def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings,
+                                       sql_lora_files):
+    lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
+    worker_lora_manager = LRUCacheWorkerLoRAManager(
+        4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
+        lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"),
+        EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
+    worker_lora_manager.create_lora_manager(llama_2_7b_model_extra_embeddings)
+
+    mapping = LoRAMapping([], [])
+    worker_lora_manager.set_active_loras([
+        LoRARequest("1", 1, sql_lora_files),
+        LoRARequest("2", 2, sql_lora_files)
+    ], mapping)
+    assert worker_lora_manager.list_loras() == {1, 2}
+    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
+    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
+
+    worker_lora_manager.set_active_loras([
+        LoRARequest("1", 1, sql_lora_files),
+        LoRARequest("3", 3, sql_lora_files),
+        LoRARequest("4", 4, sql_lora_files)
+    ], mapping)
+    assert worker_lora_manager.list_loras() == {1, 2, 3, 4}
+    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
+    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
+    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 3
+    assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 4
+
+    worker_lora_manager.set_active_loras([
+        LoRARequest("1", 1, sql_lora_files),
+        LoRARequest("2", 2, sql_lora_files),
+        LoRARequest("5", 5, sql_lora_files)
+    ], mapping)
+    assert worker_lora_manager.list_loras() == {1, 2, 4, 5}
+    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
+    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
+    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 5
+    assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 4
+
+    worker_lora_manager.set_active_loras([
+        LoRARequest("1", 1, sql_lora_files),
+        LoRARequest("1", 1, sql_lora_files),
+        LoRARequest("1", 1, sql_lora_files)
+    ], mapping)
+    assert worker_lora_manager.list_loras() == {1, 2, 4, 5}
+    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
+    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
+    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 5
+    assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 4
+
+    worker_lora_manager.set_active_loras([
+        LoRARequest("6", 6, sql_lora_files),
+        LoRARequest("7", 7, sql_lora_files),
+        LoRARequest("8", 8, sql_lora_files)
+    ], mapping)
+    assert worker_lora_manager.list_loras() == {1, 6, 7, 8}
+    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
+    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 7
+    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 8
+    assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 6
+
+    # Over capacity
+    with pytest.raises(RuntimeError):
+        worker_lora_manager.set_active_loras([
+            LoRARequest("10", 10, sql_lora_files),
+            LoRARequest("11", 11, sql_lora_files),
+            LoRARequest("12", 12, sql_lora_files),
+            LoRARequest("13", 13, sql_lora_files),
+            LoRARequest("14", 14, sql_lora_files)
+        ], mapping)
+
+
+def test_worker_lora_manager(llama_2_7b_model_extra_embeddings,
+                             sql_lora_files):
+    # Should remove every LoRA not specified in the request.
+    lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
+    worker_lora_manager = WorkerLoRAManager(
+        4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
+        lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"),
+        EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
+    worker_lora_manager.create_lora_manager(llama_2_7b_model_extra_embeddings)
+
+    mapping = LoRAMapping([], [])
+    worker_lora_manager.set_active_loras([
+        LoRARequest("1", 1, sql_lora_files),
+        LoRARequest("2", 2, sql_lora_files)
+    ], mapping)
+    assert worker_lora_manager.list_loras() == {1, 2}
+    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
+    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
+
+    worker_lora_manager.set_active_loras([
+        LoRARequest("1", 1, sql_lora_files),
+        LoRARequest("3", 3, sql_lora_files),
+        LoRARequest("4", 4, sql_lora_files)
+    ], mapping)
+    assert worker_lora_manager.list_loras() == {1, 3, 4}
+    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
+    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 3
+    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 4
+
+    worker_lora_manager.set_active_loras([
+        LoRARequest("1", 1, sql_lora_files),
+        LoRARequest("2", 2, sql_lora_files),
+        LoRARequest("5", 5, sql_lora_files)
+    ], mapping)
+    assert worker_lora_manager.list_loras() == {1, 2, 5}
+    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
+    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
+    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 5
+
+    worker_lora_manager.set_active_loras([
+        LoRARequest("1", 1, sql_lora_files),
+        LoRARequest("1", 1, sql_lora_files),
+        LoRARequest("1", 1, sql_lora_files)
+    ], mapping)
+    assert worker_lora_manager.list_loras() == {1}
+    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
+    assert worker_lora_manager._lora_manager.lora_index_to_id[1] is None
+    assert worker_lora_manager._lora_manager.lora_index_to_id[2] is None
+
+    worker_lora_manager.set_active_loras([
+        LoRARequest("6", 6, sql_lora_files),
+        LoRARequest("7", 7, sql_lora_files),
+        LoRARequest("8", 8, sql_lora_files)
+    ], mapping)
+    assert worker_lora_manager.list_loras() == {6, 7, 8}
+    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 8
+    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 6
+    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 7
+
+    # Over capacity
+    with pytest.raises(RuntimeError):
+        worker_lora_manager.set_active_loras([
+            LoRARequest("10", 10, sql_lora_files),
+            LoRARequest("11", 11, sql_lora_files),
+            LoRARequest("12", 12, sql_lora_files),
+            LoRARequest("13", 13, sql_lora_files),
+            LoRARequest("14", 14, sql_lora_files)
+        ], mapping)
+
+
+def test_packed_loras(dist_init, dummy_model_gate_up):
+    model = dummy_model_gate_up
+    model.supported_lora_modules = ["gate_up_proj"]
+    model.packed_modules_mapping = {
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+    model_lora = create_packed_lora(
+        1,
+        model,
+        module_name="gate_up_proj",
+        replaced_module_names=["gate_proj", "up_proj"])
+    model_lora1 = create_packed_lora(
+        2,
+        model,
+        module_name="gate_up_proj",
+        replaced_module_names=["gate_proj", "up_proj"],
+        empty_replaced_module_name="gate_proj",
+    )
+
+    manager = LoRAModelManager(
+        model, 2, 2, 2,
+        LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2))
+    model = manager.model
+
+    assert isinstance(model.get_submodule("gate_up_proj"),
+                      MergedColumnParallelLinearWithLoRA)
+    assert manager.add_lora(model_lora)
+    assert manager.add_lora(model_lora1)
+
+    packed_lora = model_lora.get_lora("gate_up_proj")
+    assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights)
+
+    assert torch.allclose(packed_lora.lora_a[0],
+                          model_lora.get_lora("gate_proj").lora_a)
+    assert torch.allclose(packed_lora.lora_b[0],
+                          model_lora.get_lora("gate_proj").lora_b)
+    assert torch.allclose(packed_lora.lora_a[1],
+                          model_lora.get_lora("up_proj").lora_a)
+    assert torch.allclose(packed_lora.lora_b[1],
+                          model_lora.get_lora("up_proj").lora_b)
+
+    packed_lora1 = model_lora1.get_lora("gate_up_proj")
+    assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights)
+
+    assert packed_lora1.lora_a[0] is None
+    assert packed_lora1.lora_b[0] is None
+    assert torch.allclose(packed_lora1.lora_a[1],
+                          model_lora1.get_lora("up_proj").lora_a)
+    assert torch.allclose(packed_lora1.lora_b[1],
+                          model_lora1.get_lora("up_proj").lora_b)
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -0,0 +1,53 @@
+import pytest
+import torch
+
+import vllm
+from vllm.lora.request import LoRARequest
+
+MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+
+
+def do_sample(llm, lora_path: str, lora_id: int):
+    prompts = [
+        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]",  # noqa: E501
+        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]",  # noqa: E501
+        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]",  # noqa: E501
+    ]
+    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
+        if lora_id else None)
+    # Print the outputs.
+    generated_texts = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text.strip()
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    return generated_texts
+
+
+@pytest.mark.parametrize("tp_size", [4])
+def test_mixtral_lora(mixtral_lora_files, tp_size):
+    if torch.cuda.device_count() < tp_size:
+        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+
+    llm = vllm.LLM(MODEL_PATH,
+                   enable_lora=True,
+                   max_num_seqs=16,
+                   max_loras=4,
+                   tensor_parallel_size=tp_size,
+                   worker_use_ray=True)
+
+    expected_lora_output = [
+        "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])",  # noqa: E501
+        "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])",  # noqa: E501
+        "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])",  # noqa: E501
+    ]
+
+    assert do_sample(llm, mixtral_lora_files,
+                     lora_id=1) == expected_lora_output
+    assert do_sample(llm, mixtral_lora_files,
+                     lora_id=2) == expected_lora_output
--- a/tests/lora/test_punica.py
+++ b/tests/lora/test_punica.py
@@ -0,0 +1,231 @@
+# Based on code from https://github.com/punica-ai/punica
+
+import pytest
+import torch
+
+import vllm.lora.punica as punica
+
+
+def assert_close(a, b):
+    rtol, atol = {
+        torch.float16: (5e-3, 5e-3),
+        torch.bfloat16: (3e-2, 2e-2),
+        torch.float32: (None, None),
+    }[a.dtype]
+    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
+
+
+def _lora_ref_impl(
+    y_final: torch.Tensor,
+    x: torch.Tensor,
+    wa_T_all: torch.Tensor,
+    wb_T_all: torch.Tensor,
+    indicies: torch.LongTensor,
+    layer_idx: int,
+    scale: float,
+):
+    y_stage_1 = torch.empty(
+        (x.size(0), wa_T_all.size(-2)),
+        dtype=torch.float32,
+        device=x.device,
+    )
+    bs = x.shape[0]
+    s = torch.tensor(scale, dtype=torch.float32, device=x.device)
+    for i, lora_idx in zip(range(bs), indicies.cpu().tolist()):
+        xi = x[i].unsqueeze(0).to(torch.float32)
+        wa = wa_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32)
+        if wb_T_all is not None:
+            wb = wb_T_all[lora_idx, layer_idx].transpose(-1,
+                                                         -2).to(torch.float32)
+
+        tmp = xi @ wa
+        y_stage_1[i] = tmp.squeeze(0)
+        y_final[i] += ((tmp @ wb).squeeze(0) *
+                       s if wb_T_all is not None else y_stage_1[i])
+    return y_final, y_stage_1
+
+
+H1 = H2 = [
+    128,
+    256,
+    512,
+    1024,
+    1152,
+    1280,
+    1536,
+    2048,
+    2304,
+    2560,
+    2752,
+    3072,
+    3456,
+    3584,
+    4096,
+    4608,
+    5120,
+    5504,
+    5632,
+    6144,
+    6848,
+    6912,
+    7168,
+    8192,
+    9216,
+    10240,
+    11008,
+    13824,
+    14336,
+    15360,
+    22016,
+    24576,
+    27392,
+    32000,
+    32256,
+    32512,
+    32768,
+    33024,
+    36864,
+    43264,
+    49152,
+    64000,
+    64256,
+    102400,
+    102656,
+    128000,
+    128256,
+]
+H2 = [64] + H2
+R = [1, 2, 4]
+SEED = [0xabcdabcd987]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+
+
+@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
+@pytest.mark.parametrize("h1", H1)
+@pytest.mark.parametrize("r", R)
+@pytest.mark.parametrize("seed", SEED)
+@torch.inference_mode()
+def test_lora_a_extra_shapes(dtype_str, h1, r, seed):
+    torch.manual_seed(seed)
+    num_loras = 4
+    num_layers = 1
+    bs = 32
+    dtype = getattr(torch, dtype_str)
+    device = torch.device("cuda")
+
+    wa_T_all = torch.randn(num_loras,
+                           num_layers,
+                           r,
+                           h1,
+                           dtype=dtype,
+                           device=device)
+    indices = torch.randint(num_loras, (bs, ), dtype=torch.long, device=device)
+
+    for layer_idx in range(num_layers):
+        x = torch.randn(bs, h1, dtype=dtype, device=device)
+        y = torch.randn(bs, r, dtype=dtype, device=device)
+
+        y_ref = y.clone()
+        _lora_ref_impl(
+            y_ref,
+            x,
+            wa_T_all,
+            None,
+            indices,
+            layer_idx,
+            1.0,
+        )
+
+        y_our = y.clone()
+        punica.bgmv(y_our, x, wa_T_all, indices, layer_idx, 1.0)
+
+        assert_close(y_ref, y_our)
+
+
+@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
+@pytest.mark.parametrize("h1", H1)
+@pytest.mark.parametrize("h2", H2)
+@pytest.mark.parametrize("seed", SEED)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_lora_correctness(dtype_str, h1, h2, seed, device):
+    torch.manual_seed(seed)
+    num_loras = 4
+    num_layers = 1
+    r = 8
+    bs = 32
+    scale = 0.123
+    dtype = getattr(torch, dtype_str)
+    torch.set_default_device(device)
+
+    wa_T_all = torch.randn(num_loras, num_layers, r, h1, dtype=dtype)
+    wb_T_all = torch.randn(num_loras, num_layers, h2, r, dtype=dtype)
+    indices = torch.randint(num_loras, (bs, ), dtype=torch.long)
+
+    for layer_idx in range(num_layers):
+        x = torch.randn(bs, h1, dtype=dtype)
+        y = torch.randn(bs, h2, dtype=dtype)
+
+        y_ref = y.clone()
+        _lora_ref_impl(y_ref, x, wa_T_all, wb_T_all, indices, layer_idx, scale)
+
+        y_our = y.clone()
+        punica.add_lora(y_our, x, wa_T_all, wb_T_all, indices, layer_idx,
+                        scale)
+
+        assert_close(y_ref, y_our)
+
+
+@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
+@pytest.mark.parametrize("h1", H1)
+@pytest.mark.parametrize("h2", H2)
+@pytest.mark.parametrize("seed", SEED)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_lora_correctness_slice(dtype_str, h1, h2, seed, device):
+    if h2 % 3 != 0 or h2 // 3 not in H1:
+        pytest.skip("h2 must be divisible by 3 and in supported shapes")
+    torch.manual_seed(seed)
+    num_loras = 4
+    num_layers = 1
+    r = 8
+    bs = 32
+    scale = 0.123
+    dtype = getattr(torch, dtype_str)
+    torch.set_default_device(device)
+
+    wa_T_all_0 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype)
+    wa_T_all_1 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype)
+    wa_T_all_2 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype)
+    wb_T_all_0 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype)
+    wb_T_all_1 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype)
+    wb_T_all_2 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype)
+
+    indices = torch.randint(num_loras, (bs, ), dtype=torch.long)
+
+    for layer_idx in range(num_layers):
+        x = torch.randn(bs, h1, dtype=dtype)
+        y = torch.randn(bs, h2, dtype=dtype)
+        s = h2 // 3
+
+        y_ref = y.clone()
+        _lora_ref_impl(y_ref[:, :s], x, wa_T_all_0, wb_T_all_0, indices,
+                       layer_idx, scale)
+        _lora_ref_impl(y_ref[:, s:s * 2], x, wa_T_all_1, wb_T_all_1, indices,
+                       layer_idx, scale)
+        _lora_ref_impl(y_ref[:, s * 2:], x, wa_T_all_2, wb_T_all_2, indices,
+                       layer_idx, scale)
+
+        y_our = y.clone()
+        punica.add_lora_slice(y_our, x, wa_T_all_0, wb_T_all_0, indices,
+                              layer_idx, scale, 0, s)
+        punica.add_lora_slice(y_our, x, wa_T_all_1, wb_T_all_1, indices,
+                              layer_idx, scale, s, s)
+        punica.add_lora_slice(y_our, x, wa_T_all_2, wb_T_all_2, indices,
+                              layer_idx, scale, s * 2, s)
+
+        assert_close(y_ref[:, :s], y_our[:, :s])
+        assert_close(y_ref[:, s:s * 2], y_our[:, s:s * 2])
+        assert_close(y_ref[:, s * 2:], y_our[:, s * 2:])
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -0,0 +1,179 @@
+# Adapted from
+# https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py
+from dataclasses import dataclass
+from typing import List
+
+import pytest
+
+import vllm
+from vllm.lora.request import LoRARequest
+
+from .conftest import cleanup
+
+
+@dataclass
+class ModelWithQuantization:
+    model_path: str
+    quantization: str
+
+
+MODELS: List[ModelWithQuantization] = [
+    ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
+                          quantization="AWQ"),
+    ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+                          quantization="GPTQ"),
+]
+
+
+def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256):
+    raw_prompts = [
+        "Give me an orange-ish brown color",
+        "Give me a neon pink color",
+    ]
+
+    def format_prompt_tuples(prompt):
+        return f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
+
+    prompts = [format_prompt_tuples(p) for p in raw_prompts]
+
+    sampling_params = vllm.SamplingParams(temperature=0,
+                                          max_tokens=max_tokens,
+                                          stop=["<|im_end|>"])
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
+        if lora_id else None)
+    # Print the outputs.
+    generated_texts = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    return generated_texts
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("tp_size", [1])
+def test_quant_model_lora(tinyllama_lora_files, model, tp_size):
+    # Cannot use as it will initialize torch.cuda too early...
+    # if torch.cuda.device_count() < tp_size:
+    #     pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+
+    llm = vllm.LLM(model=model.model_path,
+                   enable_lora=True,
+                   max_num_seqs=16,
+                   max_loras=4,
+                   max_model_len=400,
+                   tensor_parallel_size=tp_size,
+                   quantization=model.quantization,
+                   trust_remote_code=True)
+
+    if model.quantization is None:
+        expected_no_lora_output = [
+            "Here are some examples of orange-brown colors",
+            "I'm sorry, I don't have"
+        ]
+        expected_lora_output = [
+            "#ff8050",
+            "#ff8080",
+        ]
+    elif model.quantization == "AWQ":
+        expected_no_lora_output = [
+            "I'm sorry, I don't understand",
+            "I'm sorry, I don't understand",
+        ]
+        expected_lora_output = [
+            "#f07700: A v",
+            "#f00000: A v",
+        ]
+    elif model.quantization == "GPTQ":
+        expected_no_lora_output = [
+            "I'm sorry, I don't have",
+            "I'm sorry, I don't have",
+        ]
+        expected_lora_output = [
+            "#f08800: This is",
+            "#f07788 \n#",
+        ]
+
+    def expect_match(output, expected_output):
+        # HACK: GPTQ lora outputs are just incredibly unstable.
+        # Assert that the outputs changed.
+        if (model.quantization == "GPTQ"
+                and expected_output is expected_lora_output):
+            assert output != expected_no_lora_output
+            for i, o in enumerate(output):
+                assert o.startswith(
+                    '#'), f"Expected example {i} to start with # but got {o}"
+            return
+        assert output == expected_output
+
+    max_tokens = 10
+
+    print("lora adapter created")
+    output = do_sample(llm,
+                       tinyllama_lora_files,
+                       lora_id=0,
+                       max_tokens=max_tokens)
+    expect_match(output, expected_no_lora_output)
+
+    print("lora 1")
+    output = do_sample(llm,
+                       tinyllama_lora_files,
+                       lora_id=1,
+                       max_tokens=max_tokens)
+    expect_match(output, expected_lora_output)
+
+    print("no lora")
+    output = do_sample(llm,
+                       tinyllama_lora_files,
+                       lora_id=0,
+                       max_tokens=max_tokens)
+    expect_match(output, expected_no_lora_output)
+
+    print("lora 2")
+    output = do_sample(llm,
+                       tinyllama_lora_files,
+                       lora_id=2,
+                       max_tokens=max_tokens)
+    expect_match(output, expected_lora_output)
+
+    print("removing lora")
+
+    del llm
+    cleanup()
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.skip("Requires multiple GPUs")
+def test_quant_model_tp_equality(tinyllama_lora_files, model):
+    # Cannot use as it will initialize torch.cuda too early...
+    # if torch.cuda.device_count() < 2:
+    #     pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
+
+    llm_tp1 = vllm.LLM(model=model.model_path,
+                       enable_lora=True,
+                       max_num_seqs=16,
+                       max_loras=4,
+                       tensor_parallel_size=1,
+                       quantization=model.quantization,
+                       trust_remote_code=True)
+    output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)
+
+    del llm_tp1
+    cleanup()
+
+    llm_tp2 = vllm.LLM(model=model.model_path,
+                       enable_lora=True,
+                       max_num_seqs=16,
+                       max_loras=4,
+                       tensor_parallel_size=2,
+                       quantization=model.quantization)
+    output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)
+
+    del llm_tp2
+    cleanup()
+
+    assert output_tp1 == output_tp2
--- a/tests/lora/test_tokenizer_group.py
+++ b/tests/lora/test_tokenizer_group.py
@@ -0,0 +1,55 @@
+import pytest
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+from vllm.lora.request import LoRARequest
+from vllm.transformers_utils.tokenizer import get_lora_tokenizer
+from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
+
+from ..conftest import get_tokenizer_pool_config
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("tokenizer_group_type", [None, "ray"])
+async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
+    reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files)
+    tokenizer_group = get_tokenizer_group(
+        get_tokenizer_pool_config(tokenizer_group_type),
+        tokenizer_id="gpt2",
+        enable_lora=True,
+        max_num_seqs=1,
+        max_input_length=None,
+    )
+    lora_request = LoRARequest("1", 1, sql_lora_files)
+    assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
+        request_id="request_id", prompt="prompt", lora_request=lora_request)
+    assert reference_tokenizer.encode(
+        "prompt") == await tokenizer_group.encode_async(
+            request_id="request_id",
+            prompt="prompt",
+            lora_request=lora_request)
+    assert isinstance(tokenizer_group.get_lora_tokenizer(None),
+                      PreTrainedTokenizerBase)
+    assert tokenizer_group.get_lora_tokenizer(
+        None) == await tokenizer_group.get_lora_tokenizer_async(None)
+
+    assert isinstance(tokenizer_group.get_lora_tokenizer(lora_request),
+                      PreTrainedTokenizerBase)
+    assert tokenizer_group.get_lora_tokenizer(
+        lora_request) != tokenizer_group.get_lora_tokenizer(None)
+    assert tokenizer_group.get_lora_tokenizer(
+        lora_request) == await tokenizer_group.get_lora_tokenizer_async(
+            lora_request)
+
+
+def test_get_lora_tokenizer(sql_lora_files, tmpdir):
+    lora_request = None
+    tokenizer = get_lora_tokenizer(lora_request)
+    assert not tokenizer
+
+    lora_request = LoRARequest("1", 1, sql_lora_files)
+    tokenizer = get_lora_tokenizer(lora_request)
+    assert tokenizer.get_added_vocab()
+
+    lora_request = LoRARequest("1", 1, str(tmpdir))
+    tokenizer = get_lora_tokenizer(lora_request)
+    assert not tokenizer
--- a/tests/lora/test_utils.py
+++ b/tests/lora/test_utils.py
@@ -0,0 +1,172 @@
+from collections import OrderedDict
+
+from torch import nn
+
+from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule
+from vllm.utils import LRUCache
+
+
+def test_parse_fine_tuned_lora_name():
+    fixture = {
+        ("base_model.model.lm_head.lora_A.weight", "lm_head", True),
+        ("base_model.model.lm_head.lora_B.weight", "lm_head", False),
+        (
+            "base_model.model.model.embed_tokens.lora_embedding_A",
+            "model.embed_tokens",
+            True,
+        ),
+        (
+            "base_model.model.model.embed_tokens.lora_embedding_B",
+            "model.embed_tokens",
+            False,
+        ),
+        (
+            "base_model.model.model.layers.9.mlp.down_proj.lora_A.weight",
+            "model.layers.9.mlp.down_proj",
+            True,
+        ),
+        (
+            "base_model.model.model.layers.9.mlp.down_proj.lora_B.weight",
+            "model.layers.9.mlp.down_proj",
+            False,
+        ),
+    }
+    for name, module_name, is_lora_a in fixture:
+        assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(name)
+
+
+def test_replace_submodule():
+    model = nn.Sequential(
+        OrderedDict([
+            ("dense1", nn.Linear(764, 100)),
+            ("act1", nn.ReLU()),
+            ("dense2", nn.Linear(100, 50)),
+            (
+                "seq1",
+                nn.Sequential(
+                    OrderedDict([
+                        ("dense1", nn.Linear(100, 10)),
+                        ("dense2", nn.Linear(10, 50)),
+                    ])),
+            ),
+            ("act2", nn.ReLU()),
+            ("output", nn.Linear(50, 10)),
+            ("outact", nn.Sigmoid()),
+        ]))
+
+    sigmoid = nn.Sigmoid()
+
+    replace_submodule(model, "act1", sigmoid)
+    assert dict(model.named_modules())["act1"] == sigmoid
+
+    dense2 = nn.Linear(1, 5)
+    replace_submodule(model, "seq1.dense2", dense2)
+    assert dict(model.named_modules())["seq1.dense2"] == dense2
+
+
+class TestLRUCache(LRUCache):
+
+    def _on_remove(self, key, value):
+        if not hasattr(self, "_remove_counter"):
+            self._remove_counter = 0
+        self._remove_counter += 1
+
+
+def test_lru_cache():
+    cache = TestLRUCache(3)
+
+    cache.put(1, 1)
+    assert len(cache) == 1
+
+    cache.put(1, 1)
+    assert len(cache) == 1
+
+    cache.put(2, 2)
+    assert len(cache) == 2
+
+    cache.put(3, 3)
+    assert len(cache) == 3
+    assert set(cache.cache) == {1, 2, 3}
+
+    cache.put(4, 4)
+    assert len(cache) == 3
+    assert set(cache.cache) == {2, 3, 4}
+    assert cache._remove_counter == 1
+    assert cache.get(2) == 2
+
+    cache.put(5, 5)
+    assert set(cache.cache) == {2, 4, 5}
+    assert cache._remove_counter == 2
+
+    assert cache.pop(5) == 5
+    assert len(cache) == 2
+    assert set(cache.cache) == {2, 4}
+    assert cache._remove_counter == 3
+
+    cache.pop(10)
+    assert len(cache) == 2
+    assert set(cache.cache) == {2, 4}
+    assert cache._remove_counter == 3
+
+    cache.get(10)
+    assert len(cache) == 2
+    assert set(cache.cache) == {2, 4}
+    assert cache._remove_counter == 3
+
+    cache.put(6, 6)
+    assert len(cache) == 3
+    assert set(cache.cache) == {2, 4, 6}
+    assert 2 in cache
+    assert 4 in cache
+    assert 6 in cache
+
+    cache.remove_oldest()
+    assert len(cache) == 2
+    assert set(cache.cache) == {2, 6}
+    assert cache._remove_counter == 4
+
+    cache.clear()
+    assert len(cache) == 0
+    assert cache._remove_counter == 6
+
+    cache._remove_counter = 0
+
+    cache[1] = 1
+    assert len(cache) == 1
+
+    cache[1] = 1
+    assert len(cache) == 1
+
+    cache[2] = 2
+    assert len(cache) == 2
+
+    cache[3] = 3
+    assert len(cache) == 3
+    assert set(cache.cache) == {1, 2, 3}
+
+    cache[4] = 4
+    assert len(cache) == 3
+    assert set(cache.cache) == {2, 3, 4}
+    assert cache._remove_counter == 1
+    assert cache[2] == 2
+
+    cache[5] = 5
+    assert set(cache.cache) == {2, 4, 5}
+    assert cache._remove_counter == 2
+
+    del cache[5]
+    assert len(cache) == 2
+    assert set(cache.cache) == {2, 4}
+    assert cache._remove_counter == 3
+
+    cache.pop(10)
+    assert len(cache) == 2
+    assert set(cache.cache) == {2, 4}
+    assert cache._remove_counter == 3
+
+    cache[6] = 6
+    assert len(cache) == 3
+    assert set(cache.cache) == {2, 4, 6}
+    assert 2 in cache
+    assert 4 in cache
+    assert 6 in cache
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -0,0 +1,69 @@
+import os
+import random
+import tempfile
+from unittest.mock import patch
+
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+                         ModelConfig, ParallelConfig, SchedulerConfig)
+from vllm.lora.models import LoRAMapping
+from vllm.lora.request import LoRARequest
+from vllm.worker.worker import Worker
+
+
+@patch.dict(os.environ, {"RANK": "0"})
+def test_worker_apply_lora(sql_lora_files):
+    worker = Worker(
+        model_config=ModelConfig(
+            "meta-llama/Llama-2-7b-hf",
+            "meta-llama/Llama-2-7b-hf",
+            tokenizer_mode="auto",
+            trust_remote_code=False,
+            seed=0,
+            dtype="float16",
+            revision=None,
+        ),
+        load_config=LoadConfig(
+            download_dir=None,
+            load_format="dummy",
+        ),
+        parallel_config=ParallelConfig(1, 1, False),
+        scheduler_config=SchedulerConfig(32, 32, 32),
+        device_config=DeviceConfig("cuda"),
+        cache_config=CacheConfig(block_size=16,
+                                 gpu_memory_utilization=1.,
+                                 swap_space=0,
+                                 cache_dtype="auto"),
+        local_rank=0,
+        rank=0,
+        lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
+                               max_loras=32),
+        distributed_init_method=f"file://{tempfile.mkstemp()[1]}",
+    )
+    worker.init_device()
+    worker.load_model()
+
+    worker.model_runner.set_active_loras([], LoRAMapping([], []))
+    assert worker.list_loras() == set()
+
+    n_loras = 32
+    lora_requests = [
+        LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
+    ]
+
+    worker.model_runner.set_active_loras(lora_requests, LoRAMapping([], []))
+    assert worker.list_loras() == {
+        lora_request.lora_int_id
+        for lora_request in lora_requests
+    }
+
+    for i in range(32):
+        random.seed(i)
+        iter_lora_requests = random.choices(lora_requests,
+                                            k=random.randint(1, n_loras))
+        random.shuffle(iter_lora_requests)
+        iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
+        worker.model_runner.set_active_loras(iter_lora_requests,
+                                             LoRAMapping([], []))
+        assert worker.list_loras().issuperset(
+            {lora_request.lora_int_id
+             for lora_request in iter_lora_requests})
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -0,0 +1,88 @@
+from typing import List, Optional
+
+import torch
+
+from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
+
+
+class DummyLoRAManager:
+
+    def __init__(self):
+        super().__init__()
+        self._loras = {}
+
+    def set_module_lora(self, module_name: str, lora: LoRALayerWeights):
+        self._loras[module_name] = lora
+
+    def get_module_lora(self, module_name: str) -> Optional[LoRALayerWeights]:
+        return self._loras.get(module_name, None)
+
+    def init_random_lora(self,
+                         module_name: str,
+                         weight: torch.Tensor,
+                         rank: int = 8,
+                         generate_embeddings_tensor: int = 0):
+        lora = LoRALayerWeights(
+            module_name,
+            rank=rank,
+            lora_alpha=1,
+            lora_a=torch.rand([weight.shape[1], rank],
+                              dtype=weight.dtype,
+                              device="cuda"),
+            lora_b=torch.rand([rank, weight.shape[0]],
+                              dtype=weight.dtype,
+                              device="cuda"),
+        )
+        if generate_embeddings_tensor:
+            lora.embeddings_tensor = torch.rand(5,
+                                                generate_embeddings_tensor,
+                                                dtype=weight.dtype,
+                                                device="cuda")
+        self.set_module_lora(module_name, lora)
+
+        return lora
+
+    def init_lora(self,
+                  module_name: str,
+                  input_dim: int,
+                  output_dim: int,
+                  rank=8,
+                  noop=False,
+                  embeddings_tensor=None):
+        lora = LoRALayerWeights(
+            module_name,
+            rank=rank,
+            lora_alpha=1,
+            lora_a=torch.rand([input_dim, rank], device="cuda"),
+            lora_b=torch.rand([rank, output_dim], device="cuda"),
+            embeddings_tensor=embeddings_tensor,
+        )
+        self.set_module_lora(module_name, lora)
+        return lora
+
+    def reset_lora(self):
+        self._loras = {}
+
+    def init_packed_lora(
+        self,
+        module_name: str,
+        input_dim: int,
+        output_dims: List[int],
+        noop_lora_index: List[int] = None,
+        rank=8,
+    ):
+        base_loras = []
+        noop_lora_index = set(noop_lora_index or [])
+
+        for i, out_dim in enumerate(output_dims):
+            base_lora = self.init_lora(
+                module_name + "_000_" + str(i),
+                input_dim,
+                out_dim,
+                rank=rank,
+                noop=i in noop_lora_index,
+            )
+            base_loras.append(base_lora)
+        packed_lora = PackedLoRALayerWeights.pack(base_loras)
+        self.set_module_lora(module_name, packed_lora)
+        return packed_lora
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -0,0 +1,194 @@
+from typing import List
+
+import pytest
+from prometheus_client import REGISTRY
+
+from vllm import EngineArgs, LLMEngine
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.sampling_params import SamplingParams
+
+MODELS = [
+    "facebook/opt-125m",
+]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [128])
+def test_metric_counter_prompt_tokens(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    vllm_model = vllm_runner(model,
+                             dtype=dtype,
+                             disable_log_stats=False,
+                             gpu_memory_utilization=0.4)
+    tokenizer = vllm_model.model.get_tokenizer()
+    prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts]
+    # This test needs at least 2 prompts in a batch of different lengths to
+    # verify their token count is correct despite padding.
+    assert len(example_prompts) > 1, "at least 2 prompts are required"
+    assert prompt_token_counts[0] != prompt_token_counts[1], (
+        "prompts of different lengths are required")
+    vllm_prompt_token_count = sum(prompt_token_counts)
+
+    _ = vllm_model.generate_greedy(example_prompts, max_tokens)
+    stat_logger = vllm_model.model.llm_engine.stat_logger
+    metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
+        **stat_logger.labels)._value.get()
+
+    assert vllm_prompt_token_count == metric_count, (
+        f"prompt token count: {vllm_prompt_token_count!r}\n"
+        f"metric: {metric_count!r}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [128])
+def test_metric_counter_generation_tokens(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    vllm_model = vllm_runner(model,
+                             dtype=dtype,
+                             disable_log_stats=False,
+                             gpu_memory_utilization=0.4)
+    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    tokenizer = vllm_model.model.get_tokenizer()
+    stat_logger = vllm_model.model.llm_engine.stat_logger
+    metric_count = stat_logger.metrics.counter_generation_tokens.labels(
+        **stat_logger.labels)._value.get()
+    vllm_generation_count = 0
+    for i in range(len(example_prompts)):
+        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        prompt_ids = tokenizer.encode(example_prompts[i])
+        # vllm_output_ids contains both prompt tokens and generation tokens.
+        # We're interested only in the count of the generation tokens.
+        vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)
+
+    assert vllm_generation_count == metric_count, (
+        f"generation token count: {vllm_generation_count!r}\n"
+        f"metric: {metric_count!r}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize(
+    "served_model_name",
+    [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
+def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
+                                   served_model_name: List[str]) -> None:
+    vllm_model = vllm_runner(model,
+                             dtype=dtype,
+                             disable_log_stats=False,
+                             gpu_memory_utilization=0.3,
+                             served_model_name=served_model_name)
+    stat_logger = vllm_model.model.llm_engine.stat_logger
+    metrics_tag_content = stat_logger.labels["model_name"]
+
+    del vllm_model
+
+    if served_model_name is None or served_model_name == []:
+        assert metrics_tag_content == model, (
+            f"Metrics tag model_name is wrong! expect: {model!r}\n"
+            f"actual: {metrics_tag_content!r}")
+    else:
+        assert metrics_tag_content == served_model_name[0], (
+            f"Metrics tag model_name is wrong! expect: "
+            f"{served_model_name[0]!r}\n"
+            f"actual: {metrics_tag_content!r}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [4])
+@pytest.mark.parametrize("disable_log_stats", [True, False])
+@pytest.mark.asyncio
+async def test_async_engine_log_metrics_regression(
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    disable_log_stats: bool,
+) -> None:
+    """
+    Regression test ensuring async engine generates metrics
+    when disable_log_stats=False
+    (see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678)
+    """
+    engine_args = AsyncEngineArgs(model=model,
+                                  dtype=dtype,
+                                  disable_log_stats=disable_log_stats)
+    async_engine = AsyncLLMEngine.from_engine_args(engine_args)
+    for i, prompt in enumerate(example_prompts):
+        results = async_engine.generate(
+            prompt,
+            SamplingParams(max_tokens=max_tokens),
+            f"request-id-{i}",
+        )
+        # Exhaust the async iterator to make the async engine work
+        async for _ in results:
+            pass
+
+    assert_metrics(async_engine.engine, disable_log_stats,
+                   len(example_prompts))
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [4])
+@pytest.mark.parametrize("disable_log_stats", [True, False])
+def test_engine_log_metrics_regression(
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    disable_log_stats: bool,
+) -> None:
+    engine_args = EngineArgs(model=model,
+                             dtype=dtype,
+                             disable_log_stats=disable_log_stats)
+    engine = LLMEngine.from_engine_args(engine_args)
+    for i, prompt in enumerate(example_prompts):
+        engine.add_request(
+            f"request-id-{i}",
+            prompt,
+            SamplingParams(max_tokens=max_tokens),
+        )
+    while engine.has_unfinished_requests():
+        engine.step()
+
+    assert_metrics(engine, disable_log_stats, len(example_prompts))
+
+
+def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
+                   num_requests: int) -> None:
+    if disable_log_stats:
+        with pytest.raises(AttributeError):
+            _ = engine.stat_logger
+    else:
+        assert (engine.stat_logger
+                is not None), "engine.stat_logger should be set"
+        # Ensure the count bucket of request-level histogram metrics matches
+        # the number of requests as a simple sanity check to ensure metrics are
+        # generated
+        labels = {'model_name': engine.model_config.model}
+        request_histogram_metrics = [
+            "vllm:e2e_request_latency_seconds",
+            "vllm:request_prompt_tokens",
+            "vllm:request_generation_tokens",
+            "vllm:request_params_best_of",
+            "vllm:request_params_n",
+        ]
+        for metric_name in request_histogram_metrics:
+            metric_value = REGISTRY.get_sample_value(f"{metric_name}_count",
+                                                     labels)
+            assert (
+                metric_value == num_requests), "Metrics should be collected"
--- a/tests/model_executor/weight_utils.py
+++ b/tests/model_executor/weight_utils.py
@@ -0,0 +1,54 @@
+import os
+import tempfile
+
+import huggingface_hub.constants
+import pytest
+from huggingface_hub.utils import LocalEntryNotFoundError
+
+from vllm.model_executor.model_loader.weight_utils import (
+    download_weights_from_hf, enable_hf_transfer)
+
+
+def test_hf_transfer_auto_activation():
+    if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ:
+        # in case it is already set, we can't test the auto activation
+        pytest.skip(
+            "HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation")
+    enable_hf_transfer()
+    try:
+        # enable hf hub transfer if available
+        import hf_transfer  # type: ignore # noqa
+        HF_TRANFER_ACTIVE = True
+    except ImportError:
+        HF_TRANFER_ACTIVE = False
+    assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER ==
+            HF_TRANFER_ACTIVE)
+
+
+def test_download_weights_from_hf():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # assert LocalEntryNotFoundError error is thrown
+        # if offline is set and model is not cached
+        huggingface_hub.constants.HF_HUB_OFFLINE = True
+        with pytest.raises(LocalEntryNotFoundError):
+            download_weights_from_hf("facebook/opt-125m",
+                                     allow_patterns=["*.safetensors", "*.bin"],
+                                     cache_dir=tmpdir)
+
+        # download the model
+        huggingface_hub.constants.HF_HUB_OFFLINE = False
+        download_weights_from_hf("facebook/opt-125m",
+                                 allow_patterns=["*.safetensors", "*.bin"],
+                                 cache_dir=tmpdir)
+
+        # now it should work offline
+        huggingface_hub.constants.HF_HUB_OFFLINE = True
+        assert download_weights_from_hf(
+            "facebook/opt-125m",
+            allow_patterns=["*.safetensors", "*.bin"],
+            cache_dir=tmpdir) is not None
+
+
+if __name__ == "__main__":
+    test_hf_transfer_auto_activation()
+    test_download_weights_from_hf()
--- a/tests/models/test_aqlm.py
+++ b/tests/models/test_aqlm.py
@@ -0,0 +1,95 @@
+"""Compare the outputs of a AQLM model between vLLM and HF Transformers
+
+Run `pytest tests/models/test_aqlm.py`.
+"""
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+aqlm_not_supported = (capability <
+                      QUANTIZATION_METHODS["aqlm"].get_min_capability())
+
+# In this test we hardcode prompts and generations for the model so we don't
+# need to require the AQLM package as a dependency
+example_prompts = [
+    'vLLM is a high-throughput and memory-efficient inference and serving '
+    'engine for LLMs.\n',
+    'Briefly describe the major milestones in the development of artificial '
+    'intelligence from 1950 to 2020.\n',
+    'Compare and contrast artificial intelligence with human intelligence in '
+    'terms of processing information.\n',
+    'Describe the basic components of a neural network and how it can be '
+    'trained.\n',
+    'Write a short story about a robot that dreams for the first time.\n',
+    'Analyze the impact of the COVID-19 pandemic on global economic structures '
+    'and future business models.\n',
+    'Explain the cultural significance of the Mona Lisa painting, and how its '
+    'perception might vary in Western versus Eastern societies.\n',
+    "Translate the following English sentence into Japanese, French, and "
+    "Swahili: 'The early bird catches the worm.'\n"
+]
+
+# These ground truth generations were generated using `transformers==4.38.1
+# aqlm==1.1.0 torch==2.2.0`
+# and the below code:
+# ```python
+# from transformers import AutoTokenizer, AutoModelForCausalLM
+# model_id = "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"
+# quantized_model = AutoModelForCausalLM.from_pretrained(model_id,
+# torch_dtype="auto", device_map="cuda").cuda()
+# tokenizer = AutoTokenizer.from_pretrained(model_id)
+# outputs = []
+# for prompt in example_prompts:
+#     input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")
+#     hf_outputs = quantized_model.generate(input_ids, max_new_tokens=32)
+#     outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:]))
+# print(outputs)
+# ```
+ground_truth_generations = [
+    '\n### Features\n\n- **High-throughput**: v',
+    'The major milestones in the development of artificial intelligence from '
+    '195',
+    'Compare and contrast artificial intelligence with human intelligence in '
+    'terms of processing information. The',
+    'Explain the difference between supervised and unsupervised learning.'
+    '\nExplain',
+    'Write a short story about a robot that dreams for the first time. The',
+    'Analyze the impact of the COVID-19 pandemic on global economic',
+    'The Mona Lisa is a painting by Leonardo da Vinci, and it',
+    'The early bird catches the worm.\nThe early bird catches the'
+]
+
+
+@pytest.mark.skipif(aqlm_not_supported,
+                    reason="AQLM is not supported on this GPU type.")
+@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [16])
+@pytest.mark.parametrize("num_logprobs", [1])
+def test_models(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+
+    vllm_model = vllm_runner(model, dtype=dtype)
+    vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
+                                                       max_tokens,
+                                                       num_logprobs)
+
+    # loop through the prompts to compare against the ground truth generations
+    for prompt_idx in range(len(example_prompts)):
+        vllm_output_ids, vllm_output_str, vllm_logprobs = vllm_outputs[
+            prompt_idx]
+
+        print("Prompt:          ", repr(example_prompts[prompt_idx]))
+        print("Reference output:", repr(ground_truth_generations[prompt_idx]))
+        print("Output output:   ", repr(vllm_output_str))
+        assert vllm_output_str == ground_truth_generations[prompt_idx]
--- a/tests/models/test_big_models.py
+++ b/tests/models/test_big_models.py
@@ -0,0 +1,60 @@
+"""Compare the outputs of HF and vLLM when using greedy sampling.
+
+This tests bigger models and use half precision.
+
+Run `pytest tests/models/test_big_models.py`.
+"""
+import pytest
+
+MODELS = [
+    "meta-llama/Llama-2-7b-hf",
+    # "mistralai/Mistral-7B-v0.1",  # Broken
+    # "Deci/DeciLM-7b",  # Broken
+    # "tiiuae/falcon-7b",  # Broken
+    "EleutherAI/gpt-j-6b",
+    "mosaicml/mpt-7b",
+    # "Qwen/Qwen1.5-0.5B"  # Broken,
+]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [32])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    del hf_model
+
+    vllm_model = vllm_runner(model, dtype=dtype)
+    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    del vllm_model
+
+    for i in range(len(example_prompts)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_model_print(
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    vllm_model = vllm_runner(model, dtype=dtype)
+    # This test is for verifying whether the model's extra_repr
+    # can be printed correctly.
+    print(vllm_model.model.llm_engine.model_executor.driver_worker.
+          model_runner.model)
+    del vllm_model
--- a/tests/models/test_fp8.py
+++ b/tests/models/test_fp8.py
@@ -0,0 +1,90 @@
+# flake8: noqa
+"""Tests fp8 models against ground truth generation
+Note: these tests will only pass on L4 GPU.
+"""
+import os
+
+import pytest
+import torch
+from transformers import AutoTokenizer
+
+from vllm import LLM, SamplingParams
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+MAX_MODEL_LEN = 1024
+
+MODELS = [
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8",
+    "meta-llama/Meta-Llama-3-8B-Instruct",
+]
+
+EXPECTED_STRS_MAP = {
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8": [
+        'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
+        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+        'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
+        'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
+        'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep',
+        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here',
+        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+        'Here are the translations:\n\n**Japanese:** (Haya tori, nemuri nemuri)\n\n**'
+    ],
+    "meta-llama/Meta-Llama-3-8B-Instruct": [
+        'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
+        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+        'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
+        'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
+        'In the year 2154, the robotics lab at NeuroSpark Industries was on the cusp of',
+        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
+        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+        'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu'
+    ],
+}
+
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+fp8_not_supported = (capability <
+                     QUANTIZATION_METHODS["fp8"].get_min_capability())
+
+
+@pytest.mark.skipif(fp8_not_supported,
+                    reason="fp8 is not supported on this GPU type.")
+@pytest.mark.parametrize("model_name", MODELS)
+def test_models(
+    example_prompts,
+    model_name,
+) -> None:
+    model = LLM(model=model_name,
+                max_model_len=MAX_MODEL_LEN,
+                enforce_eager=True,
+                quantization="fp8")
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    formatted_prompts = [
+        tokenizer.apply_chat_template([{
+            "role": "user",
+            "content": prompt
+        }],
+                                      tokenize=False,
+                                      add_generation_prompt=True)
+        for prompt in example_prompts
+    ]
+
+    params = SamplingParams(max_tokens=20, temperature=0)
+    generations = []
+    # Note: these need to be run 1 at a time due to numerical precision,
+    # since the expected strs were generated this way.
+    for prompt in formatted_prompts:
+        outputs = model.generate(prompt, params)
+        generations.append(outputs[0].outputs[0].text)
+    del model
+
+    print(generations)
+    expected_strs = EXPECTED_STRS_MAP[model_name]
+    for i in range(len(example_prompts)):
+        generated_str = generations[i]
+        expected_str = expected_strs[i]
+        assert expected_str == generated_str, (
+            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -0,0 +1,98 @@
+"""Compares the outputs of gptq vs gptq_marlin 
+Note: GPTQ and Marlin do not have bitwise correctness.
+As a result, in this test, we just confirm that the top selected tokens of the
+Marlin/GPTQ models are in the top 3 selections of each other.
+Note: Marlin internally uses locks to synchronize the threads. This can
+result in very slight nondeterminism for Marlin. As a result, we re-run the test
+up to 3 times to see if we pass.
+Note: This test currently fails running with --forked with the following:
+    RuntimeError: Cannot re-initialize CUDA in forked subprocess.
+    To use CUDA with multiprocessing, you must use the 'spawn' start method
+Run `pytest tests/models/test_gptq_marlin.py`.
+"""
+import os
+
+import pytest
+import torch
+
+from tests.models.utils import check_logprobs_close
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+MAX_MODEL_LEN = 1024
+
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+gptq_marlin_not_supported = (
+    capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
+
+MODELS = [
+    # act_order==False, group_size=channelwise
+    ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
+    # act_order==False, group_size=128
+    ("TheBloke/Llama-2-7B-GPTQ", "main"),
+
+    # act_order==True, group_size=128
+    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
+    # act_order==True, group_size=64
+    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"),
+    # act_order==True, group_size=32
+    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"),
+
+    # 8-bit, act_order==True, group_size=channelwise
+    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
+    # 8-bit, act_order==True, group_size=128
+    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"),
+    # 8-bit, act_order==True, group_size=32
+    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"),
+]
+
+
+@pytest.mark.flaky(reruns=2)
+@pytest.mark.skipif(gptq_marlin_not_supported,
+                    reason="gptq_marlin is not supported on this GPU type.")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    vllm_runner,
+    example_prompts,
+    model,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    model_name, revision = model
+
+    # Run marlin.
+    gptq_marlin_model = vllm_runner(model_name=model_name,
+                                    revision=revision,
+                                    dtype=dtype,
+                                    quantization="marlin",
+                                    max_model_len=MAX_MODEL_LEN,
+                                    tensor_parallel_size=1)
+
+    gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
+        example_prompts, max_tokens, num_logprobs)
+    del gptq_marlin_model
+
+    # Run gptq.
+    gptq_model = vllm_runner(model_name=model_name,
+                             revision=revision,
+                             dtype=dtype,
+                             quantization="gptq",
+                             max_model_len=MAX_MODEL_LEN,
+                             tensor_parallel_size=1)
+    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
+                                                       max_tokens,
+                                                       num_logprobs)
+    del gptq_model
+
+    check_logprobs_close(
+        outputs_0_lst=gptq_outputs,
+        outputs_1_lst=gptq_marlin_outputs,
+        name_0="gptq",
+        name_1="gptq_marlin",
+    )
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -0,0 +1,107 @@
+import gc
+from dataclasses import fields
+from enum import Enum
+from typing import Dict, List, Tuple
+
+import pytest
+import torch
+from transformers import AutoTokenizer
+
+from vllm.config import VisionLanguageConfig
+
+model_and_vl_config = [
+    ("llava-hf/llava-1.5-7b-hf",
+     VisionLanguageConfig(
+         image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
+         image_feature_size=576,
+         image_token_id=32000,
+         image_input_shape=(1, 3, 336, 336))),
+    ("llava-hf/llava-1.5-7b-hf",
+     VisionLanguageConfig(
+         image_input_type=VisionLanguageConfig.ImageInputType.IMAGE_FEATURES,
+         image_feature_size=576,
+         image_token_id=32000,
+         image_input_shape=(1, 576, 1024)))
+]
+
+
+def as_dict(vision_language_config: VisionLanguageConfig) -> Dict:
+    """Flatten vision language config to pure args.
+
+    Compatible with what llm entrypoint expects.
+    """
+    result = {}
+    for field in fields(vision_language_config):
+        value = getattr(vision_language_config, field.name)
+        if isinstance(value, Enum):
+            result[field.name] = value.name.lower()
+        elif isinstance(value, tuple):
+            result[field.name] = ",".join([str(item) for item in value])
+        else:
+            result[field.name] = value
+    return result
+
+
+def sanitize_vllm_output(vllm_output: Tuple[List[int], str],
+                         vision_language_config: VisionLanguageConfig,
+                         model_id: str):
+    """Sanitize vllm output to be comparable with hf output.
+    The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
+    x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
+    It also reduces `output_str` from "<image><image>bla" to "bla".
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    image_token_str = tokenizer.decode(vision_language_config.image_token_id)
+    image_token_str_len = len(image_token_str)
+    input_ids, output_str = vllm_output
+    sanitized_input_ids = input_ids[0:2] + input_ids[2 + vision_language_config
+                                                     .image_feature_size - 1:]
+    sanitzied_output_str = output_str[vision_language_config.
+                                      image_feature_size *
+                                      image_token_str_len:]
+    return sanitized_input_ids, sanitzied_output_str
+
+
+@pytest.mark.parametrize("worker_use_ray", [False])
+@pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
+                vllm_image_prompts, vllm_images, model_and_config: tuple,
+                dtype: str, max_tokens: int, worker_use_ray: bool) -> None:
+    """Inference result should be the same between hf and vllm.
+
+    All the image fixtures for the test is under tests/images.
+    For huggingface runner, we provide the raw images as input.
+    For vllm runner, we provide image tensors and corresponding
+    vision language config as input.
+    Note, the text input is also adjusted to abide by vllm contract.
+    The text output is sanitized to be able to compare with hf.
+    """
+    model_id, vision_language_config = model_and_config
+    hf_model = hf_runner(model_id, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(hf_image_prompts,
+                                          max_tokens,
+                                          images=hf_images)
+    del hf_model
+
+    vllm_model = vllm_runner(model_id,
+                             dtype=dtype,
+                             worker_use_ray=worker_use_ray,
+                             **as_dict(vision_language_config))
+    vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
+                                              max_tokens,
+                                              images=vllm_images)
+    del vllm_model
+
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    for i in range(len(hf_image_prompts)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = sanitize_vllm_output(
+            vllm_outputs[i], vision_language_config, model_id)
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -0,0 +1,78 @@
+"""Compare the outputs of a GPTQ model to a Marlin model.
+
+Note: GPTQ and Marlin do not have bitwise correctness.
+As a result, in this test, we just confirm that the top selected tokens of the
+Marlin/GPTQ models are in the top 3 selections of each other.
+
+Note: Marlin internally uses locks to synchronize the threads. This can
+result in very slight nondeterminism for Marlin. As a result, we re-run the test
+up to 3 times to see if we pass.
+
+Run `pytest tests/models/test_marlin.py`.
+"""
+from dataclasses import dataclass
+
+import pytest
+import torch
+
+from tests.models.utils import check_logprobs_close
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+marlin_not_supported = (capability <
+                        QUANTIZATION_METHODS["marlin"].get_min_capability())
+
+
+@dataclass
+class ModelPair:
+    model_marlin: str
+    model_gptq: str
+
+
+model_pairs = [
+    ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128",
+              model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"),
+    ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin",
+              model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"),
+    ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin",
+              model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq")
+]
+
+
+@pytest.mark.flaky(reruns=2)
+@pytest.mark.skipif(marlin_not_supported,
+                    reason="Marlin is not supported on this GPU type.")
+@pytest.mark.parametrize("model_pair", model_pairs)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    vllm_runner,
+    example_prompts,
+    model_pair: ModelPair,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    marlin_model = vllm_runner(model_pair.model_marlin,
+                               dtype=dtype,
+                               quantization="marlin")
+    marlin_outputs = marlin_model.generate_greedy_logprobs(
+        example_prompts, max_tokens, num_logprobs)
+    del marlin_model
+
+    gptq_model = vllm_runner(model_pair.model_gptq,
+                             dtype=dtype,
+                             quantization="gptq")
+    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
+                                                       max_tokens,
+                                                       num_logprobs)
+    del gptq_model
+
+    check_logprobs_close(
+        outputs_0_lst=gptq_outputs,
+        outputs_1_lst=marlin_outputs,
+        name_0="gptq",
+        name_1="marlin",
+    )
--- a/tests/models/test_mistral.py
+++ b/tests/models/test_mistral.py
@@ -0,0 +1,40 @@
+"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
+
+Run `pytest tests/models/test_mistral.py`.
+"""
+import pytest
+
+MODELS = [
+    "mistralai/Mistral-7B-Instruct-v0.1",
+]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.skip(
+    "Two problems: 1. Failing correctness tests. 2. RuntimeError: expected "
+    "scalar type BFloat16 but found Half (only in CI).")
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_long_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(example_long_prompts, max_tokens)
+    del hf_model
+
+    vllm_model = vllm_runner(model, dtype=dtype)
+    vllm_outputs = vllm_model.generate_greedy(example_long_prompts, max_tokens)
+    del vllm_model
+
+    for i in range(len(example_long_prompts)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -0,0 +1,66 @@
+"""Compare the outputs of HF and vLLM when using greedy sampling.
+
+This test only tests small models. Big models such as 7B should be tested from
+test_big_models.py because it could use a larger instance to run tests.
+
+Run `pytest tests/models/test_models.py`.
+"""
+import pytest
+
+MODELS = [
+    "facebook/opt-125m",
+    "gpt2",
+    "bigcode/tiny_starcoder_py",
+    "EleutherAI/pythia-70m",
+    "bigscience/bloom-560m",  # Testing alibi slopes.
+    "microsoft/phi-2",
+    "stabilityai/stablelm-3b-4e1t",
+    # "allenai/OLMo-1B",  # Broken
+    "bigcode/starcoder2-3b",
+]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [96])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    # To pass the small model tests, we need full precision.
+    assert dtype == "float"
+
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    del hf_model
+
+    vllm_model = vllm_runner(model, dtype=dtype)
+    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    del vllm_model
+
+    for i in range(len(example_prompts)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_model_print(
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    vllm_model = vllm_runner(model, dtype=dtype)
+    # This test is for verifying whether the model's extra_repr
+    # can be printed correctly.
+    print(vllm_model.model.llm_engine.model_executor.driver_worker.
+          model_runner.model)
+    del vllm_model
--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
@@ -0,0 +1,32 @@
+import torch
+
+from vllm import LLM, ModelRegistry, SamplingParams
+from vllm.model_executor.models.opt import OPTForCausalLM
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+
+
+class MyOPTForCausalLM(OPTForCausalLM):
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        # this dummy model always predicts the first token
+        logits = super().compute_logits(hidden_states, sampling_metadata)
+        logits.zero_()
+        logits[:, 0] += 1.0
+        return logits
+
+
+def test_oot_registration():
+    # register our dummy model
+    ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM)
+    prompts = ["Hello, my name is", "The text does not matter"]
+    sampling_params = SamplingParams(temperature=0)
+    llm = LLM(model="facebook/opt-125m")
+    first_token = llm.get_tokenizer().decode(0)
+    outputs = llm.generate(prompts, sampling_params)
+
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        # make sure only the first token is generated
+        rest = generated_text.replace(first_token, "")
+        assert rest == ""
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -0,0 +1,29 @@
+def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1):
+    """Compare the logprobs of two sequences generated by different models, 
+    which should be similar but not necessarily equal.
+    """
+    # Loop through responses to each prompt.
+    for prompt_idx, (outputs_0,
+                     outputs_1) in enumerate(zip(outputs_0_lst,
+                                                 outputs_1_lst)):
+        output_ids_0, output_str_0, logprobs_0 = outputs_0
+        output_ids_1, output_str_1, logprobs_1 = outputs_1
+
+        # Loop through generated tokens.
+        for idx, (output_id_0,
+                  output_id_1) in enumerate(zip(output_ids_0, output_ids_1)):
+
+            # If generated tokens don't match, then
+            if output_id_0 != output_id_1:
+                # Each predicted token must be in top N logprobs of the other
+                assert output_id_0 in logprobs_1[idx], (
+                    f"Test{prompt_idx}:"
+                    f"\n{name_0}:\t{output_str_0!r}"
+                    f"\n{name_1}:\t{output_str_1!r}")
+                assert output_id_1 in logprobs_0[idx], (
+                    f"Test{prompt_idx}:"
+                    f"\n{name_0}:\t{output_str_0!r}"
+                    f"\n{name_1}:\t{output_str_1!r}")
+
+                # Break out since sequences will now diverge.
+                break
--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -0,0 +1,75 @@
+"""Compare the with and without prefix caching.
+
+Run `pytest tests/prefix_caching/test_prefix_caching.py`.
+"""
+import pytest
+
+from vllm.core.block_manager_v1 import CachedBlockAllocator
+from vllm.utils import Device
+
+
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("num_blocks", [16])
+def test_block_allocator(
+    block_size: int,
+    num_blocks: int,
+):
+    block_hash = 1
+    block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)
+
+    # Allocate two PysicalTokenBlocks with the same hash and check
+    # that they are the same PhysicalTokenBlock
+    first_block = block_allocator.allocate(block_hash, 0)
+    second_block = block_allocator.allocate(block_hash, 0)
+    assert (first_block == second_block)
+    assert (second_block.ref_count == 2)
+
+    # Free the first_block and confirm that the ref_count is correctly
+    # decremented on the second block
+    block_allocator.free(first_block)
+    assert (second_block.ref_count == 1)
+
+    # Free the second block
+    block_allocator.free(second_block)
+
+    # Reallocate the first block and confirm that, even after the block
+    # had its ref_count go to 0, we still get the same block back
+    first_block = block_allocator.allocate(block_hash, 0)
+    assert (first_block == second_block)
+    assert (first_block.block_hash == block_hash)
+
+
+@pytest.mark.parametrize("num_blocks", [16])
+def test_eviction(num_blocks: int, ):
+    block_size = 16
+    block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)
+    blocks = []
+
+    for i in range(num_blocks):
+        # use i as the block_hash
+        blocks.append(block_allocator.allocate(i, 0))
+
+    #Free all blocks
+    for block in blocks:
+        block_allocator.free(block)
+
+    # Allocate a new block and confirm that it's the first block freed.
+    # I.E The Least Recently Used block
+    new_block_hash = block_size
+    new_block = block_allocator.allocate(new_block_hash, 0)
+    assert (new_block == blocks[0])
+    assert (new_block.block_hash == new_block_hash)
+
+    # Reallocate the second in blocks to remove it from the free list
+    realloc_block_hash = 1
+    realloc_block = block_allocator.allocate(realloc_block_hash, 0)
+    assert (realloc_block == blocks[realloc_block_hash])
+    assert (realloc_block.block_hash == realloc_block_hash)
+
+    # Allocate a new block and confirm that it's not the realloc_block,
+    # since the realloc_block shouldn't be in the free list
+    new_block_hash = block_size + 1
+    new_block = block_allocator.allocate(new_block_hash, 0)
+    assert (realloc_block != new_block)
+    assert (new_block.block_hash == new_block_hash)
+    assert (new_block.block_number == 2)
--- a/tests/prompts/example.txt
+++ b/tests/prompts/example.txt
@@ -0,0 +1,8 @@
+vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.
+Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.
+Compare and contrast artificial intelligence with human intelligence in terms of processing information.
+Describe the basic components of a neural network and how it can be trained.
+Write a short story about a robot that dreams for the first time.
+Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.
+Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.
+Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'
--- a/tests/prompts/summary.txt
+++ b/tests/prompts/summary.txt
--- a/tests/quantization/test_configs.py
+++ b/tests/quantization/test_configs.py
@@ -0,0 +1,73 @@
+"""Tests whether Marlin models can be loaded from the autogptq config.
+
+Run `pytest tests/quantization/test_configs.py --forked`.
+"""
+
+from dataclasses import dataclass
+
+import pytest
+
+from vllm.config import ModelConfig
+
+
+@dataclass
+class ModelPair:
+    model_marlin: str
+    model_gptq: str
+
+
+# Model Id // Quantization Arg // Expected Type
+MODEL_ARG_EXPTYPES = [
+    # AUTOGPTQ
+    # compat: autogptq <=0.7.1 is_marlin_format: bool
+    # Model Serialized in Marlin Format should always use Marlin kernel.
+    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", None, "marlin"),
+    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin", "marlin"),
+    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "gptq", "marlin"),
+    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "awq", "ERROR"),
+    # Model Serialized in Exllama Format.
+    ("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"),
+    ("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"),
+    ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"),
+    ("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"),
+    # compat: autogptq >=0.8.0 use checkpoint_format: str
+    # Model Serialized in Marlin Format should always use Marlin kernel.
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", None, "marlin"),
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin", "marlin"),
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "gptq", "marlin"),
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "awq", "ERROR"),
+    # Model Serialized in Exllama Format.
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"),
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"),
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq", "gptq"),
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"),
+
+    # AUTOAWQ
+    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq"),
+    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"),
+    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "marlin", "ERROR"),
+    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "gptq", "ERROR"),
+]
+
+
+@pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES)
+def test_auto_gptq(model_arg_exptype: str) -> None:
+    model_path, quantization_arg, expected_type = model_arg_exptype
+
+    try:
+        model_config = ModelConfig(model_path,
+                                   model_path,
+                                   tokenizer_mode="auto",
+                                   trust_remote_code=False,
+                                   seed=0,
+                                   dtype="float16",
+                                   revision=None,
+                                   quantization=quantization_arg)
+        found_quantization_type = model_config.quantization
+    except ValueError:
+        found_quantization_type = "ERROR"
+
+    assert found_quantization_type == expected_type, (
+        f"Expected quant_type == {expected_type} for {model_path}, "
+        f"but found {found_quantization_type} "
+        f"for no --quantization {quantization_arg} case")
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -0,0 +1,24 @@
+"""Tests whether FP8 computation is enabled correctly.
+
+Run `pytest tests/quantization/test_fp8.py --forked`.
+"""
+import pytest
+import torch
+
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
+
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+
+
+@pytest.mark.skipif(
+    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
+    reason="FP8 is not supported on this GPU type.")
+def test_load_fp16_model(vllm_runner) -> None:
+    llm = vllm_runner("facebook/opt-125m", quantization="fp8")
+
+    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
+    fc1 = model.model.decoder.layers[0].fc1
+    assert isinstance(fc1.quant_method, Fp8LinearMethod)
+    assert fc1.weight.dtype == torch.float8_e4m3fn
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -0,0 +1,54 @@
+"""Compare the outputs of HF and vLLM when using beam search.
+
+Run `pytest tests/samplers/test_beam_search.py`.
+"""
+import gc
+
+import pytest
+import torch
+
+# FIXME(zhuohan): The test can not pass if we:
+#   1. Increase max_tokens to 256.
+#   2. Increase beam_width to 8.
+#   3. Use the model "huggyllama/llama-7b".
+MAX_TOKENS = [128]
+BEAM_WIDTHS = [4]
+MODELS = ["facebook/opt-125m"]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", MAX_TOKENS)
+@pytest.mark.parametrize("beam_width", BEAM_WIDTHS)
+def test_beam_search_single_input(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    beam_width: int,
+) -> None:
+    example_prompts = example_prompts[:1]
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
+                                               max_tokens)
+    del hf_model
+
+    vllm_model = vllm_runner(model, dtype=dtype)
+    vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
+                                                   max_tokens)
+    del vllm_model
+    # NOTE(woosuk): For some reason, the following GC is required to avoid
+    # GPU OOM errors in the following tests using `vllm_runner`.
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    for i in range(len(example_prompts)):
+        hf_output_ids, _ = hf_outputs[i]
+        vllm_output_ids, _ = vllm_outputs[i]
+        assert len(hf_output_ids) == len(vllm_output_ids)
+        for j in range(len(hf_output_ids)):
+            assert hf_output_ids[j] == vllm_output_ids[j], (
+                f"Test{i} output{j}:\nHF: {hf_output_ids}\n"
+                f"vLLM: {vllm_output_ids}")
--- a/tests/samplers/test_ignore_eos.py
+++ b/tests/samplers/test_ignore_eos.py
@@ -0,0 +1,31 @@
+"""Make sure ignore_eos works.
+
+Run `pytest tests/samplers/test_ignore_eos.py`.
+"""
+
+import pytest
+
+from vllm import SamplingParams
+
+MODELS = ["facebook/opt-125m"]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [1024])
+def test_beam_search_single_input(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    example_prompts = "1 + 1 is"
+
+    vllm_model = vllm_runner(model, dtype=dtype)
+    sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
+    ignore_eos_output = vllm_model.model.generate(
+        example_prompts, sampling_params=sampling_params)
+    print(len(ignore_eos_output[0].outputs[0].token_ids))
+    assert max_tokens - len(ignore_eos_output[0].outputs[0].token_ids) < 10
+    assert max_tokens - len(ignore_eos_output[0].outputs[0].token_ids) >= 0
--- a/tests/samplers/test_logits_processor.py
+++ b/tests/samplers/test_logits_processor.py
@@ -0,0 +1,62 @@
+import pytest
+import torch
+
+from vllm import SamplingParams
+
+MODELS = ["facebook/opt-125m"]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_logits_processor_force_generate(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    vllm_model = vllm_runner(model, dtype=dtype)
+    tokenizer = vllm_model.model.get_tokenizer()
+    repeat_times = 2
+    enforced_answers = " vLLM"
+    vllm_token_ids = tokenizer.encode(enforced_answers,
+                                      add_special_tokens=False)
+    max_tokens = len(vllm_token_ids) * repeat_times
+
+    def pick_vllm(token_ids, logits):
+        token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)]
+        logits[token_id] = torch.finfo(logits.dtype).max
+        return logits
+
+    params_with_logprobs = SamplingParams(
+        logits_processors=[pick_vllm],
+        prompt_logprobs=3,
+        max_tokens=max_tokens,
+    )
+
+    # test logits_processors when prompt_logprobs is not None
+    vllm_model.model._add_request(
+        prompt=example_prompts[0],
+        sampling_params=params_with_logprobs,
+        prompt_token_ids=None,
+    )
+
+    # test prompt_logprobs is not None
+    vllm_model.model._add_request(
+        prompt=example_prompts[1],
+        sampling_params=SamplingParams(
+            prompt_logprobs=3,
+            max_tokens=max_tokens,
+        ),
+        prompt_token_ids=None,
+    )
+
+    # test grouped requests
+    vllm_model.model._add_request(
+        prompt=example_prompts[2],
+        sampling_params=SamplingParams(max_tokens=max_tokens),
+        prompt_token_ids=None,
+    )
+
+    outputs = vllm_model.model._run_engine(False)
+
+    assert outputs[0].outputs[0].text == enforced_answers * repeat_times
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -0,0 +1,124 @@
+import pytest
+import torch
+
+from tests.conftest import VllmRunner
+from vllm import SamplingParams
+
+MODELS = ["facebook/opt-125m"]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1])
+@pytest.mark.parametrize("num_top_logprobs", [6])  # 32000 == vocab_size
+def test_get_prompt_logprobs(
+    hf_runner,
+    vllm_runner,
+    model,
+    dtype,
+    chunked_prefill_token_size: int,
+    num_top_logprobs: int,
+    example_prompts,
+):
+    max_num_seqs = 256
+    enable_chunked_prefill = False
+    max_num_batched_tokens = None
+    if chunked_prefill_token_size != -1:
+        enable_chunked_prefill = True
+        max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
+        max_num_batched_tokens = chunked_prefill_token_size
+
+    max_tokens = 5
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_logprobs = hf_model.generate_greedy_logprobs(
+        example_prompts,
+        max_tokens=max_tokens,
+    )
+    del hf_model
+
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        max_logprobs=num_top_logprobs,
+        enable_chunked_prefill=enable_chunked_prefill,
+        max_num_batched_tokens=max_num_batched_tokens,
+        max_num_seqs=max_num_seqs,
+    )
+    vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
+                                          logprobs=num_top_logprobs,
+                                          prompt_logprobs=num_top_logprobs,
+                                          temperature=0.0)
+    vllm_results = vllm_model.model.generate(
+        example_prompts, sampling_params=vllm_sampling_params)
+
+    # Test whether logprobs are included in the results.
+    for result in vllm_results:
+        assert result.prompt_logprobs is not None
+        assert result.outputs[0].logprobs is not None
+        assert len(result.outputs[0].logprobs) == max_tokens
+        for logprobs in result.outputs[0].logprobs:
+            assert len(logprobs) == num_top_logprobs
+        output_text = result.outputs[0].text
+        output_string_from_most_likely_tokens = []
+        for top_logprobs in result.outputs[0].logprobs:
+            top_logprob = next(iter(top_logprobs.values()))
+            output_string_from_most_likely_tokens.append(
+                top_logprob.decoded_token)
+        output_string_from_most_likely_tokens = "".join(
+            output_string_from_most_likely_tokens)
+        assert output_text == output_string_from_most_likely_tokens, (
+            "The output text from the top logprob for each token position "
+            "should be the same as the output text in the result.")
+
+        # The first prompt logprob is always None
+        assert result.prompt_logprobs[0] is None
+        for prompt_logprobs in result.prompt_logprobs[1:]:
+            # If the prompt token is not included in the top X
+            # logprob, it can return 1 more data
+            assert (len(prompt_logprobs) == num_top_logprobs
+                    or len(prompt_logprobs) == num_top_logprobs + 1)
+
+    # Test whether prompt logprobs are consistent with HF
+    for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs):
+        # Check prompt logprobs
+        # The first prompt logprob is always None, so we compare it from 1:.
+        vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:]
+        for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs):
+            for token_id, logprob in vllm_prompt_logprob_dict.items():
+                torch.testing.assert_close(logprob.logprob,
+                                           hf_logprob[0][i][token_id].item(),
+                                           atol=1e-2,
+                                           rtol=1e-2)
+        vllm_sample_logprobs = vllm_result.outputs[0].logprobs
+        for i, top_logprobs in enumerate(vllm_sample_logprobs):
+            for token_id, sample_logprob in top_logprobs.items():
+                logprob = sample_logprob.logprob
+                torch.testing.assert_close(logprob,
+                                           hf_logprob[i][-1][token_id].item(),
+                                           atol=1e-2,
+                                           rtol=1e-2)
+                assert isinstance(sample_logprob.decoded_token, str), (
+                    "The token should be decoded by the time it is returned "
+                    " to the user.")
+
+    # Test if prompt logprobs are correctly set.
+    for vllm_result in vllm_results:
+        token_ids = vllm_result.prompt_token_ids
+        prompt_logprobs = vllm_result.prompt_logprobs
+
+        # The first token doesn't have logprob.
+        assert prompt_logprobs[0] is None
+
+        for token_id, logprob_dict in zip(token_ids[1:], prompt_logprobs[1:]):
+            assert token_id in logprob_dict
+
+
+def test_max_logprobs():
+    runner = VllmRunner("facebook/opt-125m", max_logprobs=1)
+    vllm_sampling_params = SamplingParams(logprobs=1)
+    # should pass
+    runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
+
+    bad_sampling_params = SamplingParams(logprobs=2)
+    with pytest.raises(ValueError):
+        runner.generate(["Hello world"], sampling_params=bad_sampling_params)
--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
@@ -0,0 +1,50 @@
+import pytest
+
+from vllm import SamplingParams
+
+MODELS = ["facebook/opt-125m"]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_ranks(
+    vllm_runner,
+    model,
+    dtype,
+    example_prompts,
+):
+    max_tokens = 5
+    num_top_logprobs = 5
+    num_prompt_logprobs = 5
+
+    vllm_model = vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs)
+
+    ## Test greedy logprobs ranks
+    vllm_sampling_params = SamplingParams(temperature=0.0,
+                                          top_p=1.0,
+                                          max_tokens=max_tokens,
+                                          logprobs=num_top_logprobs,
+                                          prompt_logprobs=num_prompt_logprobs)
+    vllm_results = vllm_model.generate_w_logprobs(example_prompts,
+                                                  vllm_sampling_params)
+    for result in vllm_results:
+        assert result[2] is not None
+        assert len(result[2]) == len(result[0])
+        # check whether all chosen tokens have ranks = 1
+        for token, logprobs in zip(result[0], result[2]):
+            assert token in logprobs
+            assert logprobs[token].rank == 1
+
+    ## Test non-greedy logprobs ranks
+    sampling_params = SamplingParams(temperature=1.0,
+                                     top_p=1.0,
+                                     max_tokens=max_tokens,
+                                     logprobs=num_top_logprobs,
+                                     prompt_logprobs=num_prompt_logprobs)
+    res = vllm_model.generate_w_logprobs(example_prompts, sampling_params)
+    for result in res:
+        assert result[2] is not None
+        assert len(result[2]) == len(result[0])
+        # check whether all chosen tokens have ranks
+        for token, logprobs in zip(result[0], result[2]):
+            assert logprobs[token].rank >= 1
--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -0,0 +1,385 @@
+"""Tests for rejection sampling."""
+from typing import List, Tuple
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from vllm.model_executor.layers.rejection_sampler import RejectionSampler
+from vllm.model_executor.utils import set_random_seed
+
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+
+
+def mock_causal_accepted_tensor(
+        k: int, last_accepted_indices: torch.Tensor) -> torch.Tensor:
+    """Generate an "accepted" tensor which should yield causally-accepted tokens
+    up to last accepted indices.
+
+    Tokens after last_accepted_indices+1 may also be accepted, although they
+    will not be causally accepted.
+    """
+    batch_size = last_accepted_indices.shape[0]
+
+    accepted = (torch.arange(k).expand(batch_size, k) <=
+                last_accepted_indices.unsqueeze(-1).broadcast_to(
+                    batch_size, k)).to(device="cuda")
+
+    # Sprinkle accepted values after the contiguous initial accepted values.
+    # This replicates the behavior of rejection sampling, which may "accept"
+    # a token that cannot be accepted because of causality.
+    sprinkle_candidates = (
+        torch.arange(k).expand(batch_size, k) >
+        last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + 1)
+    sprinkle = torch.rand(batch_size, k, device="cuda") > 0.5
+    accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates]
+    return accepted
+
+
+@pytest.mark.parametrize("seed", list(range(10)))
+@pytest.mark.parametrize(
+    "which_tokens_accepted",
+    ["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_correct_output_format(which_tokens_accepted: str, seed: int,
+                               device: str):
+    """Verify the output has correct format given predetermined accepted matrix.
+    """
+    set_random_seed(seed)
+    torch.set_default_device(device)
+
+    batch_size = 10
+    k = 5
+    vocab_size = 3000
+
+    if which_tokens_accepted == "all_tokens_accepted":
+        accepted = mock_causal_accepted_tensor(
+            k, -1 + k * torch.ones((batch_size, ), dtype=torch.long))
+    elif which_tokens_accepted == "no_tokens_accepted":
+        accepted = mock_causal_accepted_tensor(
+            k, -torch.ones((batch_size, ), dtype=torch.long))
+    elif which_tokens_accepted == "some_tokens_accepted":
+        last_accepted_indices = torch.randint(low=-1,
+                                              high=k,
+                                              size=(batch_size, ))
+        accepted = mock_causal_accepted_tensor(k, last_accepted_indices)
+    else:
+        raise AssertionError()
+
+    recovered_token_ids = torch.randint(low=0,
+                                        high=vocab_size,
+                                        size=(batch_size, k),
+                                        dtype=torch.int64)
+    draft_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, k),
+                                    dtype=torch.int64)
+    bonus_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, 1),
+                                    dtype=torch.int64)
+
+    rejection_sampler = RejectionSampler()
+    rejection_sampler.init_gpu_tensors(rank=0)
+    output_token_ids = rejection_sampler._create_output(  # pylint: disable=protected-access
+        accepted,
+        recovered_token_ids,
+        draft_token_ids,
+        bonus_token_ids,
+    )
+
+    # Bonus tokens are currently disabled. Verify they're set to -1.
+    # See https://github.com/vllm-project/vllm/issues/4212
+    expected_bonus_token_ids = bonus_token_ids.clone() * 0 - 1
+
+    if which_tokens_accepted == "all_tokens_accepted":
+        # Expect all tokens to be equal to draft tokens.
+        assert torch.equal(output_token_ids[:, :-1], draft_token_ids)
+
+        # Expect all bonus tokens to be included.
+        assert torch.equal(output_token_ids[:, -1:], expected_bonus_token_ids)
+    elif which_tokens_accepted == "no_tokens_accepted":
+        # Expect first token to be equal to recovered tokens.
+        assert torch.equal(output_token_ids[:, 0], recovered_token_ids[:, 0])
+
+        # Expect everything else to be -1.
+        assert torch.equal(output_token_ids[:, 1:],
+                           torch.ones_like(output_token_ids[:, 1:]) * -1)
+    elif which_tokens_accepted == "some_tokens_accepted":
+        recovered_plus_bonus = torch.cat(
+            (recovered_token_ids, expected_bonus_token_ids), dim=-1)
+        # Assert first rejected token is a recovered token or bonus token.
+        assert torch.equal(
+            recovered_plus_bonus[torch.arange(0, batch_size),
+                                 last_accepted_indices + 1],
+            output_token_ids[torch.arange(0, batch_size),
+                             last_accepted_indices + 1])
+
+        # Assert every subsequent token is -1.
+        subsequent_mask = torch.arange(0, k + 1).expand(
+            batch_size, k + 1) >= (last_accepted_indices + 2).unsqueeze(-1)
+        assert torch.all(output_token_ids[subsequent_mask] == -1)
+
+
+@pytest.mark.parametrize("k", list(range(1, 6)))
+@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
+@pytest.mark.parametrize("batch_size", list(range(1, 32)))
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
+                                    device: str):
+    torch.set_default_device(device)
+    rejection_sampler = RejectionSampler()
+    rejection_sampler.init_gpu_tensors(rank=0)
+
+    draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
+    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
+    bonus_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, 1),
+                                    dtype=torch.int64)
+    draft_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, k),
+                                    dtype=torch.int64)
+
+    rejection_sampler(target_probs, bonus_token_ids, draft_probs,
+                      draft_token_ids)
+
+
+@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"])
+@pytest.mark.parametrize("which_token_ids",
+                         ["bonus_token_ids", "draft_token_ids"])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
+                               which_token_ids: str, device: str):
+    k = 3
+    batch_size = 5
+    vocab_size = 30_000
+    torch.set_default_device(device)
+
+    rejection_sampler = RejectionSampler(strict_mode=True)
+    rejection_sampler.init_gpu_tensors(rank=0)
+
+    draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
+    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
+    bonus_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, 1),
+                                    dtype=torch.int64)
+    draft_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, k),
+                                    dtype=torch.int64)
+
+    oob_token_ids = None
+    if which_token_ids == "bonus_token_ids":
+        oob_token_ids = bonus_token_ids
+    elif which_token_ids == "draft_token_ids":
+        oob_token_ids = draft_token_ids
+    else:
+        raise AssertionError()
+
+    if above_or_below_vocab_range == "above":
+        rogue_token_id = vocab_size + 1
+    elif above_or_below_vocab_range == "below":
+        rogue_token_id = -1
+    else:
+        raise AssertionError()
+
+    oob_token_ids[0][0] = rogue_token_id
+
+    with pytest.raises(AssertionError):
+        rejection_sampler(target_probs, bonus_token_ids, draft_probs,
+                          draft_token_ids)
+
+
+@pytest.mark.parametrize("draft_and_target_probs_equal", [True, False])
+@pytest.mark.parametrize("seed", list(range(5)))
+@torch.inference_mode()
+def test_rejection_sampling_approximates_target_distribution(
+        seed: int, draft_and_target_probs_equal: bool):
+    """Verify rejection sampling approximates target distribution,
+    despite sampling from a potentially distinct draft distribution.
+
+    This is done by first creating a random target probability
+    distribution and a random draft probability distribution. We then
+    sample token ids from the rejection sampler using these draft
+    and target distributions. The samples are used to estimate
+    the output probability distribution, which we expect to approximate
+    the target distribution.
+
+    A basic distance metric is used to determine similarity between
+    distributions.
+
+    We expect that as we increase the number of samples,
+    the distance between the observed distribution and the target
+    distribution decreases. To measure this, we compare the distance
+    of the observed distribution against both the target distribution
+    and a uniform random distribution. We expect the distance between
+    the observed distribution and the target distribution to improve
+    much more than the distance improvement between the observed
+    distribution and the random distribution.
+
+    When draft_and_target_probs_equal=True, the draft and target
+    probabilities are exactly equal. Rejection sampling should
+    still work without any NaNs or exceptions.
+    """
+    torch.set_default_device("cpu")
+    set_random_seed(seed)
+
+    helper = _CorrectnessTestHelper(
+        vocab_size=10,
+        rejection_sampler=RejectionSampler(),
+    )
+
+    draft_probs, target_probs, reference_probs = helper.generate_probs_for_test(
+        draft_and_target_probs_equal)
+
+    sample_sizes = [10, 100, 1_000, 10_000, 100_000]
+    distance_wrt_reference = []
+    distance_wrt_target = []
+
+    for num_samples in sample_sizes:
+        (reference_vs_rejsample_dist,
+         target_vs_rejsample_dist) = helper.run_and_compare_distributions(
+             draft_probs,
+             target_probs,
+             reference_probs,
+             num_samples,
+         )
+
+        distance_wrt_reference.append(reference_vs_rejsample_dist)
+        distance_wrt_target.append(target_vs_rejsample_dist)
+
+        relative_change_in_distance_wrt_target = get_ratio_first_to_last(
+            distance_wrt_target)
+        relative_change_in_distance_wrt_reference = get_ratio_first_to_last(
+            distance_wrt_reference)
+
+        print(f"{num_samples=} {target_vs_rejsample_dist=:.05f} "
+              f"{reference_vs_rejsample_dist=:.05f}")
+        print(f"{num_samples=} {relative_change_in_distance_wrt_target=:.02f} "
+              f"{relative_change_in_distance_wrt_reference=:.02f}")
+
+    relative_change_in_distance_wrt_target = get_ratio_first_to_last(
+        distance_wrt_target)
+    relative_change_in_distance_wrt_reference = get_ratio_first_to_last(
+        distance_wrt_reference)
+
+    expected_improvement_multiplier = 20
+    assert (relative_change_in_distance_wrt_target >
+            relative_change_in_distance_wrt_reference *
+            expected_improvement_multiplier)
+
+
+def get_ratio_first_to_last(elements: List[float]) -> float:
+    return elements[0] / elements[-1]
+
+
+class _CorrectnessTestHelper:
+    """Class that packages together logic required for the unit-level
+    rejection sampling correctness test.
+    """
+
+    def __init__(self, vocab_size: int, rejection_sampler: RejectionSampler):
+        self.rejection_sampler = rejection_sampler
+        self.vocab_size = vocab_size
+        self.vocab_range = (0, vocab_size)
+
+        self.rejection_sampler.init_gpu_tensors(rank=0)
+
+        # Keep test simple, use k=1
+        self.k = 1
+
+        # Bonus tokens not used, but rejection sampler requires
+        # correct shape.
+        self.num_bonus_tokens = 1
+
+    def generate_probs_for_test(
+        self, draft_and_target_probs_equal: bool
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        draft_probs, target_probs = [
+            F.softmax(
+                torch.rand(self.vocab_size, dtype=torch.float32),
+                dim=-1,
+            ) for _ in range(2)
+        ]
+
+        num_reference_probs = 100
+        reference_probs = F.softmax(
+            torch.rand(num_reference_probs,
+                       self.vocab_size,
+                       dtype=torch.float32),
+            dim=-1,
+        )
+
+        if draft_and_target_probs_equal:
+            target_probs = draft_probs.clone()
+
+        return draft_probs, target_probs, reference_probs
+
+    def run_and_compare_distributions(self, draft_probs: torch.Tensor,
+                                      target_probs: torch.Tensor,
+                                      reference_probs: torch.Tensor,
+                                      num_samples: int) -> Tuple[float, float]:
+        # Sample using rejection sampling.
+        rej_sample_probs = self._estimate_rejection_sampling_pdf(
+            draft_probs, target_probs, num_samples)
+
+        # Average distance from reference probs.
+        reference_vs_rejsample_dist = torch.dist(
+            reference_probs,
+            rej_sample_probs).item() / reference_probs.shape[0]
+        target_vs_rejsample_dist = torch.dist(target_probs,
+                                              rej_sample_probs).item()
+
+        return reference_vs_rejsample_dist, target_vs_rejsample_dist
+
+    def _estimate_rejection_sampling_pdf(
+        self,
+        draft_probs: torch.Tensor,
+        target_probs: torch.Tensor,
+        num_samples: int,
+    ) -> torch.Tensor:
+        # Repeat draft probs num_samples times.
+        draft_probs = draft_probs.reshape(1, self.k, self.vocab_size).repeat(
+            num_samples, 1, 1)
+
+        # Repeat target probs num_samples * k times.
+        # Rejection sampler requires bonus token probs, but they aren't used.
+        target_probs = target_probs.reshape(1, 1, self.vocab_size).repeat(
+            num_samples, self.k, 1)
+
+        # Randomly sample draft token ids from draft probs.
+        draft_token_ids = torch.multinomial(draft_probs[:, 0, :],
+                                            num_samples=1,
+                                            replacement=True).reshape(
+                                                num_samples, self.k)
+
+        # Bonus tokens not used but required.
+        bonus_token_ids = torch.zeros((1, self.num_bonus_tokens),
+                                      dtype=torch.int64,
+                                      device="cuda").repeat(num_samples, 1)
+
+        # Get output tokens via rejection sampling.
+        output_token_ids = self.rejection_sampler(target_probs.to("cuda"),
+                                                  bonus_token_ids.to("cuda"),
+                                                  draft_probs.to("cuda"),
+                                                  draft_token_ids.to("cuda"))
+
+        # Remove bonus tokens
+        output_token_ids = output_token_ids[:, :-1].flatten()
+
+        # Estimate probability density function
+        hist = torch.histogram(output_token_ids.to(dtype=torch.float,
+                                                   device="cpu"),
+                               bins=self.vocab_size,
+                               range=self.vocab_range,
+                               density=True)
+
+        return hist.hist
--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -0,0 +1,661 @@
+import itertools
+import random
+from typing import List, Optional, Tuple
+from unittest.mock import patch
+
+import pytest
+import torch
+from transformers import GenerationConfig, GenerationMixin
+
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.model_executor.utils import set_random_seed
+from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
+from vllm.utils import Counter
+from vllm.worker.model_runner import ModelRunner
+
+
+class MockLogitsSampler(Sampler):
+
+    def __init__(self, fake_logits: torch.Tensor):
+        super().__init__()
+        self.fake_logits = fake_logits
+
+    def forward(self, *args, **kwargs):
+        return super().forward(*args, **kwargs)
+
+
+def _prepare_test(
+    batch_size: int
+) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler, ModelRunner]:
+    input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16)
+    fake_logits = torch.full((batch_size, VOCAB_SIZE),
+                             1e-2,
+                             dtype=input_tensor.dtype)
+    sampler = MockLogitsSampler(fake_logits)
+    model_runner = ModelRunner(model_config=None,
+                               parallel_config=None,
+                               scheduler_config=None,
+                               device_config=None,
+                               load_config=None,
+                               lora_config=None)
+    return input_tensor, fake_logits, sampler, model_runner
+
+
+VOCAB_SIZE = 32000
+RANDOM_SEEDS = list(range(128))
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+
+
+def _do_sample(
+    batch_size: int,
+    input_tensor: torch.Tensor,
+    sampler: MockLogitsSampler,
+    model_runner: ModelRunner,
+    sampling_params: SamplingParams,
+    device: str,
+):
+    seq_group_metadata_list = []
+    seq_lens = []
+    for i in range(batch_size):
+        seq_group_metadata_list.append(
+            SequenceGroupMetadata(
+                request_id=f"test_{i}",
+                is_prompt=True,
+                seq_data={0: SequenceData([1, 2, 3])},
+                sampling_params=sampling_params,
+                block_tables={0: [1]},
+            ))
+        seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
+
+    sampling_metadata = SamplingMetadata.prepare(
+        seq_group_metadata_list,
+        seq_lens,
+        query_lens=seq_lens,
+        device=device,
+        pin_memory=model_runner.pin_memory)
+    return sampler(logits=input_tensor, sampling_metadata=sampling_metadata)
+
+
+@pytest.mark.parametrize("seed", RANDOM_SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_all_greedy(seed: int, device: str):
+    set_random_seed(seed)
+    torch.set_default_device(device)
+    batch_size = random.randint(1, 256)
+    input_tensor, fake_logits, sampler, model_runner = _prepare_test(
+        batch_size)
+
+    sampling_params = SamplingParams(temperature=0)
+    sampler_output = _do_sample(batch_size, fake_logits, sampler, model_runner,
+                                sampling_params, device)
+    expected = torch.argmax(fake_logits, dim=-1)
+    for i, sequence_output in enumerate(sampler_output):
+        for nth_output in sequence_output.samples:
+            assert nth_output.output_token == expected[i].item()
+
+    del model_runner
+
+
+@pytest.mark.parametrize("seed", RANDOM_SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_all_random(seed: int, device: str):
+    set_random_seed(seed)
+    torch.set_default_device(device)
+    batch_size = random.randint(1, 256)
+    input_tensor, fake_logits, sampler, model_runner = _prepare_test(
+        batch_size)
+
+    for i in range(batch_size):
+        fake_logits[i, i] = 1e2
+
+    sampling_params = SamplingParams(
+        temperature=1.0,
+        n=random.randint(1, 10),
+    )
+    sampler_output = _do_sample(batch_size, fake_logits, sampler, model_runner,
+                                sampling_params, device)
+
+    for i, sequence_output in enumerate(sampler_output):
+        for nth_output in sequence_output.samples:
+            assert nth_output.output_token == i
+
+    del model_runner
+
+
+@pytest.mark.parametrize("seed", RANDOM_SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_all_random_seed(seed: int, device: str):
+    set_random_seed(seed)
+    torch.set_default_device(device)
+    batch_size = random.randint(1, 256)
+    _, fake_logits, sampler, model_runner = _prepare_test(batch_size)
+
+    for i in range(batch_size):
+        fake_logits[i, i] = 1e2
+
+    sampling_params = SamplingParams(
+        temperature=1.0,
+        n=random.randint(1, 10),
+        seed=random.randint(0, 10000),
+    )
+    sampler_output = _do_sample(batch_size, fake_logits, sampler, model_runner,
+                                sampling_params, device)
+
+    for i, sequence_output in enumerate(sampler_output):
+        for nth_output in sequence_output.samples:
+            assert nth_output.output_token == i
+
+    del model_runner
+
+
+@pytest.mark.parametrize("seed", RANDOM_SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_all_random_seed_deterministic(seed: int, device: str):
+    set_random_seed(seed)
+    torch.set_default_device(device)
+    batch_size = random.randint(1, 256)
+    _, fake_logits, sampler, model_runner = _prepare_test(batch_size)
+
+    sampling_params = SamplingParams(
+        temperature=1.0,
+        n=random.randint(1, 10),
+        seed=random.randint(0, 10000),
+    )
+    first_sampler_output = _do_sample(batch_size, fake_logits, sampler,
+                                      model_runner, sampling_params, device)
+
+    second_sampler_output = _do_sample(batch_size, fake_logits, sampler,
+                                       model_runner, sampling_params, device)
+
+    assert first_sampler_output == second_sampler_output
+
+    del model_runner
+
+
+@pytest.mark.parametrize("seed", RANDOM_SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_all_beam(seed: int, device: str):
+    set_random_seed(seed)
+    torch.set_default_device(device)
+    batch_size = random.randint(1, 256)
+    _, fake_logits, sampler, model_runner = _prepare_test(batch_size)
+
+    sampling_params = SamplingParams(
+        temperature=0,
+        best_of=2,
+        use_beam_search=True,
+    )
+    _do_sample(batch_size, fake_logits, sampler, model_runner, sampling_params,
+               device)
+    # no assertion here as I am not sure how to determine whether
+    # the outputs are expected - in other words, this just tests
+    # whether there are no exceptions in the sampler
+    # when handling an all-beam search case.
+    del model_runner
+
+
+@pytest.mark.parametrize("seed", RANDOM_SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_min_tokens_penalty(seed: int, device: str):
+    seq_id_counter = Counter(start=random.randint(0, 100))
+    set_random_seed(seed)
+    torch.set_default_device(device)
+
+    def create_sampling_params(min_tokens,
+                               eos_token_id=0,
+                               *,
+                               stop_token_ids: Optional[List[int]] = None,
+                               prompt_logprobs: Optional[int] = None):
+        sampling_params = SamplingParams(
+            min_tokens=min_tokens,
+            max_tokens=9999,  # keep higher than max of min_tokens
+            stop_token_ids=stop_token_ids,
+            # requesting prompt_logprobs changes the structure of `logits`
+            prompt_logprobs=prompt_logprobs,
+        )
+        sampling_params.all_stop_token_ids.add(eos_token_id)
+        return sampling_params
+
+    def create_sequence_data(num_input=3, num_generated=0):
+        seq_data = SequenceData(
+            random.choices(range(0, VOCAB_SIZE), k=num_input))
+        if num_generated > 0:
+            seq_data.output_token_ids = random.choices(range(0, VOCAB_SIZE),
+                                                       k=num_generated)
+        return seq_data
+
+    def generate_test_case():
+        # generate multiple seq groups but limit total batch size
+        batch_size = random.randint(1, 128)
+
+        expected_penalization = []
+        sequence_metadata_list = []
+        # 20% chance to generate seq group metadata list with all prompts
+        is_prompt = random.random() < 0.2
+        while batch_size > 0:
+            num_seqs = 1 if is_prompt else random.randint(1, batch_size)
+
+            eos_token_id = random.randint(0, VOCAB_SIZE - 1)
+            min_tokens = random.randint(0, 50)
+            num_stop_tokens = random.randint(0, 8)
+            if num_stop_tokens > 0:
+                stop_token_ids = random.choices(range(0, VOCAB_SIZE - 1),
+                                                k=num_stop_tokens)
+            else:
+                stop_token_ids = None
+
+            sampling_params = create_sampling_params(
+                min_tokens=min_tokens,
+                eos_token_id=eos_token_id,
+                stop_token_ids=stop_token_ids)
+
+            seq_data = {}
+            seq_group_penalization = []
+            for _ in range(num_seqs):
+                num_input = random.randint(1, 100)
+                num_generated = 0 if is_prompt else random.randint(1, 100)
+                seq_data[next(seq_id_counter)] = create_sequence_data(
+                    num_input=num_input, num_generated=num_generated)
+                seq_group_penalization.append(num_generated < min_tokens)
+
+            expected_penalization.extend(seq_group_penalization)
+            sequence_metadata_list.append(
+                SequenceGroupMetadata(
+                    request_id=f"test_{batch_size}",
+                    is_prompt=is_prompt,
+                    seq_data=seq_data,
+                    sampling_params=sampling_params,
+                    block_tables={},
+                ))
+            batch_size -= num_seqs
+
+        return {
+            "expected_penalization": expected_penalization,
+            "seq_group_metadata_list": sequence_metadata_list,
+        }
+
+    # define some explicit test cases for edge case behavior
+    prompt_without_penalization = {
+        "expected_penalization": [False],
+        "seq_group_metadata_list": [
+            SequenceGroupMetadata(
+                request_id="test_1",
+                is_prompt=True,
+                seq_data={
+                    next(seq_id_counter): create_sequence_data(),
+                },
+                sampling_params=create_sampling_params(0),
+                block_tables={},
+            ),
+        ]
+    }
+
+    prompt_with_penalization = {
+        "expected_penalization": [True],
+        "seq_group_metadata_list": [
+            SequenceGroupMetadata(
+                request_id="test_1",
+                is_prompt=True,
+                seq_data={
+                    next(seq_id_counter): create_sequence_data(),
+                },
+                sampling_params=create_sampling_params(1),
+                block_tables={},
+            ),
+        ]
+    }
+
+    prompt_with_penalization_and_prompt_logprobs = {
+        "expected_penalization": [False, False, True],
+        "seq_group_metadata_list": [
+            SequenceGroupMetadata(
+                request_id="test_1",
+                is_prompt=True,
+                seq_data={
+                    next(seq_id_counter): create_sequence_data(num_input=3),
+                },
+                sampling_params=create_sampling_params(1, prompt_logprobs=3),
+                block_tables={},
+            ),
+        ]
+    }
+
+    stop_penalizing_after_min_tokens = {
+        "expected_penalization": [False],
+        "seq_group_metadata_list": [
+            SequenceGroupMetadata(
+                request_id="test_1",
+                is_prompt=False,
+                seq_data={
+                    next(seq_id_counter):
+                    create_sequence_data(num_generated=1),
+                },
+                sampling_params=create_sampling_params(1),
+                block_tables={},
+            )
+        ]
+    }
+
+    stop_token_ids = [42, 99, 42, 0]  # intentional duplication
+    prompt_combination = {
+        "expected_penalization": [False, True, False],
+        "seq_group_metadata_list": [
+            SequenceGroupMetadata(
+                request_id="test_2",
+                is_prompt=True,
+                seq_data={
+                    next(seq_id_counter): create_sequence_data(num_input=2),
+                },
+                sampling_params=create_sampling_params(1, prompt_logprobs=3),
+                block_tables={},
+            ),
+            SequenceGroupMetadata(
+                request_id="test_3",
+                is_prompt=True,
+                seq_data={
+                    next(seq_id_counter): create_sequence_data(),
+                },
+                sampling_params=create_sampling_params(
+                    0, stop_token_ids=stop_token_ids),
+                block_tables={},
+            )
+        ]
+    }
+
+    stop_token_ids = [1, 999, 37, 37]  # intentional duplication
+    decode_combination = {
+        "expected_penalization": [True, False, False, True, False],
+        "seq_group_metadata_list": [
+            SequenceGroupMetadata(
+                request_id="test_1",
+                is_prompt=False,
+                seq_data={
+                    next(seq_id_counter):
+                    create_sequence_data(num_generated=1),
+                    next(seq_id_counter):
+                    create_sequence_data(num_generated=100),
+                },
+                sampling_params=create_sampling_params(
+                    2, stop_token_ids=stop_token_ids),
+                block_tables={},
+            ),
+            SequenceGroupMetadata(
+                request_id="test_2",
+                is_prompt=False,
+                seq_data={
+                    next(seq_id_counter):
+                    create_sequence_data(num_generated=20),
+                    next(seq_id_counter):
+                    create_sequence_data(num_generated=1),
+                    next(seq_id_counter):
+                    create_sequence_data(num_generated=10),
+                },
+                sampling_params=create_sampling_params(
+                    10, prompt_logprobs=5, stop_token_ids=stop_token_ids),
+                block_tables={},
+            ),
+        ]
+    }
+
+    if seed == 0:
+        test_cases = [
+            prompt_without_penalization,
+            prompt_with_penalization,
+            prompt_with_penalization_and_prompt_logprobs,
+            stop_penalizing_after_min_tokens,
+            prompt_combination,
+            decode_combination,
+        ]
+    else:
+        test_cases = [generate_test_case()]
+
+    def run_test_case(*,
+                      expected_penalization=None,
+                      seq_group_metadata_list=None):
+        assert expected_penalization, \
+            "Invalid test case, need expected_penalization"
+        assert seq_group_metadata_list, \
+            "Invalid test case, need seq_group_metadata_list"
+
+        batch_size = 0
+        seq_lens = []
+        sampling_params_per_row = []
+        for sgm in seq_group_metadata_list:
+            sampling_params = sgm.sampling_params
+
+            num_rows = len(sgm.seq_data)
+            if sgm.is_prompt:
+                # a prompt seq_group has only one sequence
+                seq_data = next(iter(sgm.seq_data.values()))
+                prompt_len = seq_data.get_prompt_len()
+                seq_lens.append(prompt_len)
+
+                if sgm.sampling_params.prompt_logprobs:
+                    # with prompt_logprobs each token in the prompt has a row in
+                    # logits
+                    num_rows = prompt_len
+
+            batch_size += num_rows
+            sampling_params_per_row.extend(
+                itertools.repeat(sampling_params, num_rows))
+
+        assert len(
+            expected_penalization
+        ) == batch_size, \
+            ("Invalid test case, expected_penalization does not match computed"
+             "batch size")
+
+        _, fake_logits, sampler, model_runner = _prepare_test(batch_size)
+        sampling_metadata = SamplingMetadata.prepare(
+            seq_group_metadata_list,
+            seq_lens=seq_lens if seq_lens else None,
+            query_lens=seq_lens if seq_lens else None,
+            device=device,
+            pin_memory=model_runner.pin_memory)
+        # the logits tensor is modified in-place by the sampler
+        _ = sampler(logits=fake_logits, sampling_metadata=sampling_metadata)
+
+        for logits_idx, (should_penalize, sampling_params) in enumerate(
+                zip(expected_penalization, sampling_params_per_row)):
+
+            tokens_to_check = sampling_params.all_stop_token_ids
+
+            if should_penalize:
+                for token_id in tokens_to_check:
+                    assert fake_logits[logits_idx, token_id] == -float(
+                        'inf'
+                    ), f"Expected token {token_id} for logits row {logits_idx}"
+                    " to be penalized"
+                # no other tokens should be set to -inf
+                assert torch.count_nonzero(
+                    fake_logits[logits_idx, :] == -float('inf')) == len(
+                        tokens_to_check
+                    ), f"Expected only {len(tokens_to_check)} to be penalized"
+            else:
+                # no tokens should be set to -inf
+                assert torch.count_nonzero(
+                    fake_logits[logits_idx, :] ==
+                    -float('inf')) == 0, "No tokens should have been penalized"
+
+        del model_runner
+
+    for test_case in test_cases:
+        run_test_case(**test_case)
+
+
+@pytest.mark.parametrize("seed", RANDOM_SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_mixed(seed: int, device: str):
+    set_random_seed(seed)
+    torch.set_default_device(device)
+    batch_size = random.randint(1, 256)
+    input_tensor, fake_logits, sampler, model_runner = _prepare_test(
+        batch_size)
+
+    seq_group_metadata_list = []
+    expected_tokens: List[Optional[List[int]]] = []
+    seq_lens = []
+    for i in range(batch_size):
+        expected: Optional[List[int]] = None
+        sampling_type = random.randint(0, 3)
+        if sampling_type == 0:
+            sampling_params = SamplingParams(temperature=0)
+            expected = [torch.argmax(fake_logits[i], dim=-1).item()]
+        elif sampling_type in (1, 2):
+            n = random.randint(1, 10)
+            sampling_params = SamplingParams(
+                temperature=random.random() + 0.1,
+                top_p=min(random.random() + 0.1, 1),
+                top_k=random.randint(0, 10) or -1,
+                n=n,
+                presence_penalty=random.randint(0, 1),
+            )
+            if sampling_type == 2:
+                sampling_params.seed = random.randint(0, 10000)
+            else:
+                for idx in range(n):
+                    fake_logits[i, i + idx] = 1e2
+                expected = list(range(i, i + n))
+        else:
+            sampling_params = SamplingParams(temperature=0,
+                                             use_beam_search=True,
+                                             best_of=2)
+        expected_tokens.append(expected)
+        seq_group_metadata_list.append(
+            SequenceGroupMetadata(
+                request_id=f"test_{i}",
+                is_prompt=True,
+                seq_data={0: SequenceData([1, 2, 3])},
+                sampling_params=sampling_params,
+                block_tables={0: [1]},
+            ))
+        seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
+
+    def test_sampling(model_runner: ModelRunner):
+        sampling_metadata = SamplingMetadata.prepare(
+            seq_group_metadata_list,
+            seq_lens,
+            query_lens=seq_lens,
+            device=device,
+            pin_memory=model_runner.pin_memory)
+        sampler_output = sampler(logits=fake_logits,
+                                 sampling_metadata=sampling_metadata)
+
+        for i, (sequence_output, metadata) in enumerate(
+                zip(sampler_output, seq_group_metadata_list)):
+            if metadata.sampling_params.use_beam_search:
+                continue
+
+            if (metadata.sampling_params.seed is not None
+                    and expected_tokens[i] is None):
+                # Record seeded random result to compare with results of
+                # second invocation
+                expected_tokens[i] = [
+                    nth_output.output_token
+                    for nth_output in sequence_output.samples
+                ]
+                continue
+
+            for n, nth_output in enumerate(sequence_output.samples):
+                if (metadata.sampling_params.temperature == 0
+                        or metadata.sampling_params.seed is not None):
+                    # Ensure exact matches for greedy or random with seed
+                    assert nth_output.output_token == expected_tokens[i][n]
+                else:
+                    # For non-seeded random check that one of the high-logit
+                    # tokens were chosen
+                    assert nth_output.output_token in expected_tokens[i]
+
+    # Test batch
+    test_sampling(model_runner)
+
+    # Shuffle the batch and resample
+    target_index = list(range(batch_size))
+    for list_to_shuffle in (target_index, seq_group_metadata_list,
+                            expected_tokens, seq_lens):
+        random.Random(seed).shuffle(list_to_shuffle)
+    target_index = torch.tensor(target_index)
+    input_tensor.data = input_tensor.index_select(0, target_index)
+    fake_logits.data = fake_logits.index_select(0, target_index)
+
+    # This time, results of seeded random samples will be compared with
+    # the corresponding sample in the pre-shuffled batch
+    test_sampling(model_runner)
+
+    del model_runner
+
+
+@pytest.mark.parametrize("seed", RANDOM_SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_top_k_top_p(seed: int, device: str):
+    set_random_seed(seed)
+    batch_size = random.randint(1, 256)
+    top_k = random.randint(100, 500)
+    top_p = random.random() * 0.1
+    vocab_size = 32000
+    input_tensor = torch.rand((batch_size, 1024),
+                              device=device,
+                              dtype=torch.float16)
+    fake_logits = torch.normal(0,
+                               5,
+                               size=(batch_size, vocab_size),
+                               device=input_tensor.device,
+                               dtype=input_tensor.dtype)
+    sampler = MockLogitsSampler(fake_logits)
+    model_runner = ModelRunner(model_config=None,
+                               parallel_config=None,
+                               scheduler_config=None,
+                               device_config=None,
+                               load_config=None,
+                               lora_config=None)
+
+    generation_model = GenerationMixin()
+    generation_config = GenerationConfig(top_k=top_k,
+                                         top_p=top_p,
+                                         do_sample=True)
+    warpers = generation_model._get_logits_warper(generation_config)
+    assert len(warpers) == 2  # top_p and top_k
+
+    seq_group_metadata_list = []
+    seq_lens = []
+    for i in range(batch_size):
+        seq_group_metadata_list.append(
+            SequenceGroupMetadata(
+                request_id=f"test_{i}",
+                is_prompt=True,
+                seq_data={0: SequenceData([1, 2, 3])},
+                sampling_params=SamplingParams(
+                    temperature=1,
+                    top_k=top_k,
+                    top_p=top_p,
+                ),
+                block_tables={0: [1]},
+            ))
+        seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
+
+    sampling_metadata = SamplingMetadata.prepare(
+        seq_group_metadata_list,
+        seq_lens,
+        query_lens=seq_lens,
+        device=device,
+        pin_memory=model_runner.pin_memory)
+
+    sample_probs = None
+
+    def mock_sample(probs, *args, **kwargs):
+        nonlocal sample_probs
+        sample_probs = probs
+        return ([[prob.topk(1, dim=-1).indices.tolist(), [0]]
+                 for prob in probs], None)
+
+    with patch("vllm.model_executor.layers.sampler._sample", mock_sample):
+        sampler(logits=fake_logits, sampling_metadata=sampling_metadata)
+    hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone())
+    hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
+    assert torch.allclose(hf_probs, sample_probs, atol=1e-5)
+    assert torch.equal(hf_probs.eq(0), sample_probs.eq(0))
+
+    del model_runner
--- a/Show More
+++ b/Show More