init

2026-01-09 13:34:11 +08:00
parent dfa6476b58
commit b2ef04d792
538 changed files with 105693 additions and 2 deletions
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -0,0 +1,8 @@
+# Benchmarking vLLM
+
+## Downloading the ShareGPT dataset
+
+You can download the dataset by running:
+```bash
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+```
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -0,0 +1,389 @@
+import json
+import os
+import sys
+import time
+import traceback
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+import aiohttp
+from tqdm.asyncio import tqdm
+
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+
+
+@dataclass
+class RequestFuncInput:
+    prompt: str
+    api_url: str
+    prompt_len: int
+    output_len: int
+    model: str
+    best_of: int = 1
+    use_beam_search: bool = False
+
+
+@dataclass
+class RequestFuncOutput:
+    generated_text: str = ""
+    success: bool = False
+    latency: float = 0.0
+    ttft: float = 0.0  # Time to first token
+    itl: List[float] = field(
+        default_factory=list)  # List of inter-token latencies
+    prompt_len: int = 0
+    error: str = ""
+
+
+async def async_request_tgi(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith("generate_stream")
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        assert not request_func_input.use_beam_search
+        params = {
+            "best_of": request_func_input.best_of,
+            "max_new_tokens": request_func_input.output_len,
+            "do_sample": True,
+            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
+            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
+        }
+        payload = {
+            "inputs": request_func_input.prompt,
+            "parameters": params,
+        }
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data:")
+
+                        data = json.loads(chunk)
+                        timestamp = time.perf_counter()
+                        # First token
+                        if ttft == 0.0:
+                            ttft = time.perf_counter() - st
+                            output.ttft = ttft
+
+                        # Decoding phase
+                        else:
+                            output.itl.append(timestamp -
+                                              most_recent_timestamp)
+
+                        most_recent_timestamp = timestamp
+
+                    output.latency = most_recent_timestamp - st
+                    output.success = True
+                    output.generated_text = data["generated_text"]
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
+async def async_request_trt_llm(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith("generate_stream")
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        assert not request_func_input.use_beam_search
+        assert request_func_input.best_of == 1
+        payload = {
+            "accumulate_tokens": True,
+            "text_input": request_func_input.prompt,
+            "temperature": 0.0,
+            "top_p": 1.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": True,
+        }
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data:")
+
+                        data = json.loads(chunk)
+                        output.generated_text += data["text_output"]
+                        timestamp = time.perf_counter()
+                        # First token
+                        if ttft == 0.0:
+                            ttft = time.perf_counter() - st
+                            output.ttft = ttft
+
+                        # Decoding phase
+                        else:
+                            output.itl.append(timestamp -
+                                              most_recent_timestamp)
+
+                        most_recent_timestamp = timestamp
+
+                    output.latency = most_recent_timestamp - st
+                    output.success = True
+
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
+async def async_request_deepspeed_mii(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        assert request_func_input.best_of == 1
+        assert not request_func_input.use_beam_search
+
+        payload = {
+            "prompt": request_func_input.prompt,
+            "max_tokens": request_func_input.output_len,
+            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
+            "top_p": 1.0,
+        }
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
+        # will use 0 as placeholder.
+        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
+        output.ttft = 0
+
+        st = time.perf_counter()
+        try:
+            async with session.post(url=request_func_input.api_url,
+                                    json=payload) as response:
+                if response.status == 200:
+                    parsed_resp = await response.json()
+                    output.latency = time.perf_counter() - st
+                    output.generated_text = parsed_resp["text"][0]
+                    output.success = True
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
+async def async_request_openai_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "v1/completions"
+    ), "OpenAI Completions API URL must end with 'v1/completions'."
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        assert not request_func_input.use_beam_search
+        payload = {
+            "model": request_func_input.model,
+            "prompt": request_func_input.prompt,
+            "temperature": 0.0,
+            "best_of": request_func_input.best_of,
+            "max_tokens": request_func_input.output_len,
+            "stream": True,
+        }
+        headers = {
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
+        }
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload,
+                                    headers=headers) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data: ")
+                        if chunk == "[DONE]":
+                            latency = time.perf_counter() - st
+                        else:
+                            data = json.loads(chunk)
+
+                            if data["choices"][0]["text"]:
+                                timestamp = time.perf_counter()
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                # NOTE: Some completion API might have a last
+                                # usage summary response without a token so we
+                                # do not want to include as inter-token-latency
+                                elif data.get("usage", None) is None:
+                                    output.itl.append(timestamp -
+                                                      most_recent_timestamp)
+
+                                most_recent_timestamp = timestamp
+                                generated_text += data["choices"][0]["text"]
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+async def async_request_openai_chat_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "v1/chat/completions"
+    ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        assert not request_func_input.use_beam_search
+        payload = {
+            "model": request_func_input.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": request_func_input.prompt,
+                },
+            ],
+            "temperature": 0.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": True,
+        }
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        }
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload,
+                                    headers=headers) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data: ")
+                        if chunk == "[DONE]":
+                            latency = time.perf_counter() - st
+                        else:
+                            timestamp = time.perf_counter()
+                            data = json.loads(chunk)
+
+                            delta = data["choices"][0]["delta"]
+                            if delta.get("content", None):
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp -
+                                                      most_recent_timestamp)
+
+                                generated_text += delta["content"]
+
+                            most_recent_timestamp = timestamp
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
+# introduced in Python 3.9
+def remove_prefix(text: str, prefix: str) -> str:
+    if text.startswith(prefix):
+        return text[len(prefix):]
+    return text
+
+
+ASYNC_REQUEST_FUNCS = {
+    "tgi": async_request_tgi,
+    "vllm": async_request_openai_completions,
+    "lmdeploy": async_request_openai_completions,
+    "deepspeed-mii": async_request_deepspeed_mii,
+    "openai": async_request_openai_completions,
+    "openai-chat": async_request_openai_chat_completions,
+    "tensorrt-llm": async_request_trt_llm,
+}
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -0,0 +1,195 @@
+"""Benchmark the latency of processing a single batch of requests."""
+import argparse
+import time
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from vllm import LLM, SamplingParams
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+
+def main(args: argparse.Namespace):
+    print(args)
+
+    # NOTE(woosuk): If the request cannot be processed in a single batch,
+    # the engine will automatically process the request in multiple batches.
+    llm = LLM(model=args.model,
+              tokenizer=args.tokenizer,
+              quantization=args.quantization,
+              tensor_parallel_size=args.tensor_parallel_size,
+              trust_remote_code=args.trust_remote_code,
+              dtype=args.dtype,
+              enforce_eager=args.enforce_eager,
+              kv_cache_dtype=args.kv_cache_dtype,
+              quantization_param_path=args.quantization_param_path,
+              device=args.device,
+              ray_workers_use_nsight=args.ray_workers_use_nsight,
+              enable_chunked_prefill=args.enable_chunked_prefill,
+              download_dir=args.download_dir,
+              block_size=args.block_size)
+
+    sampling_params = SamplingParams(
+        n=args.n,
+        temperature=0.0 if args.use_beam_search else 1.0,
+        top_p=1.0,
+        use_beam_search=args.use_beam_search,
+        ignore_eos=True,
+        max_tokens=args.output_len,
+    )
+    print(sampling_params)
+    dummy_prompt_token_ids = np.random.randint(10000,
+                                               size=(args.batch_size,
+                                                     args.input_len))
+    dummy_prompt_token_ids = dummy_prompt_token_ids.tolist()
+
+    def run_to_completion(profile_dir: Optional[str] = None):
+        if profile_dir:
+            with torch.profiler.profile(
+                    activities=[
+                        torch.profiler.ProfilerActivity.CPU,
+                        torch.profiler.ProfilerActivity.CUDA,
+                    ],
+                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                        str(profile_dir))) as p:
+                llm.generate(prompt_token_ids=dummy_prompt_token_ids,
+                             sampling_params=sampling_params,
+                             use_tqdm=False)
+            print(p.key_averages())
+        else:
+            start_time = time.perf_counter()
+            llm.generate(prompt_token_ids=dummy_prompt_token_ids,
+                         sampling_params=sampling_params,
+                         use_tqdm=False)
+            end_time = time.perf_counter()
+            latency = end_time - start_time
+            return latency
+
+    print("Warming up...")
+    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+        run_to_completion(profile_dir=None)
+
+    if args.profile:
+        profile_dir = args.profile_result_dir
+        if not profile_dir:
+            profile_dir = Path(
+                "."
+            ) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
+        print(f"Profiling (results will be saved to '{profile_dir}')...")
+        run_to_completion(profile_dir=profile_dir)
+        return
+
+    # Benchmark.
+    latencies = []
+    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
+        latencies.append(run_to_completion(profile_dir=None))
+    latencies = np.array(latencies)
+    percentages = [10, 25, 50, 75, 90]
+    percentiles = np.percentile(latencies, percentages)
+    print(f'Avg latency: {np.mean(latencies)} seconds')
+    for percentage, percentile in zip(percentages, percentiles):
+        print(f'{percentage}% percentile latency: {percentile} seconds')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Benchmark the latency of processing a single batch of '
+        'requests till completion.')
+    parser.add_argument('--model', type=str, default='facebook/opt-125m')
+    parser.add_argument('--tokenizer', type=str, default=None)
+    parser.add_argument('--quantization',
+                        '-q',
+                        choices=[*QUANTIZATION_METHODS, None],
+                        default=None)
+    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+    parser.add_argument('--input-len', type=int, default=32)
+    parser.add_argument('--output-len', type=int, default=128)
+    parser.add_argument('--batch-size', type=int, default=8)
+    parser.add_argument('--n',
+                        type=int,
+                        default=1,
+                        help='Number of generated sequences per prompt.')
+    parser.add_argument('--use-beam-search', action='store_true')
+    parser.add_argument('--num-iters-warmup',
+                        type=int,
+                        default=10,
+                        help='Number of iterations to run for warmup.')
+    parser.add_argument('--num-iters',
+                        type=int,
+                        default=30,
+                        help='Number of iterations to run.')
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
+                        help='trust remote code from huggingface')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
+    parser.add_argument('--enforce-eager',
+                        action='store_true',
+                        help='enforce eager mode and disable CUDA graph')
+    parser.add_argument(
+        "--kv-cache-dtype",
+        type=str,
+        choices=['auto', 'fp8'],
+        default='auto',
+        help=
+        'Data type for kv cache storage. If "auto", will use model data type. '
+        'FP8_E5M2 (without scaling) is only supported on cuda version greater '
+        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
+        'common inference criteria.')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
+    parser.add_argument(
+        '--profile',
+        action='store_true',
+        help='profile the generation process of a single batch')
+    parser.add_argument(
+        '--profile-result-dir',
+        type=str,
+        default=None,
+        help=('path to save the pytorch profiler output. Can be visualized '
+              'with ui.perfetto.dev or Tensorboard.'))
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        choices=["cuda", "cpu"],
+        help='device type for vLLM execution, supporting CUDA and CPU.')
+    parser.add_argument('--block-size',
+                        type=int,
+                        default=16,
+                        help='block size of key/value cache')
+    parser.add_argument(
+        '--enable-chunked-prefill',
+        action='store_true',
+        help='If True, the prefill requests can be chunked based on the '
+        'max_num_batched_tokens')
+    parser.add_argument(
+        "--ray-workers-use-nsight",
+        action='store_true',
+        help="If specified, use nsight to profile ray workers",
+    )
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
+    args = parser.parse_args()
+    main(args)
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -0,0 +1,62 @@
+import argparse
+import time
+
+from vllm import LLM, SamplingParams
+
+PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n"  # noqa: E501
+
+
+def test_prefix(llm=None, sampling_params=None, prompts=None):
+    start_time = time.time()
+
+    llm.generate(prompts, sampling_params=sampling_params)
+
+    end_time = time.time()
+    print(f"cost time {end_time - start_time}")
+
+
+def main(args):
+    llm = LLM(model=args.model,
+              tokenizer_mode='auto',
+              trust_remote_code=True,
+              enforce_eager=True,
+              use_v2_block_manager=args.use_v2_block_manager,
+              tensor_parallel_size=args.tensor_parallel_size,
+              enable_prefix_caching=args.enable_prefix_caching)
+
+    num_prompts = 100
+    prompts = [PROMPT] * num_prompts
+    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+
+    print("------warm up------")
+    test_prefix(
+        llm=llm,
+        prompts=prompts,
+        sampling_params=sampling_params,
+    )
+
+    print("------start generating------")
+    test_prefix(
+        llm=llm,
+        prompts=prompts,
+        sampling_params=sampling_params,
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description='Benchmark the performance with or without automatic '
+        'prefix caching.')
+    parser.add_argument('--model',
+                        type=str,
+                        default='baichuan-inc/Baichuan2-13B-Chat')
+    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+    parser.add_argument('--output-len', type=int, default=10)
+    parser.add_argument('--enable-prefix-caching',
+                        action='store_true',
+                        help='enable prefix caching')
+    parser.add_argument('--use-v2-block-manager',
+                        action='store_true',
+                        help='Use BlockSpaceMangerV2')
+    args = parser.parse_args()
+    main(args)
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -0,0 +1,596 @@
+"""Benchmark online serving throughput.
+
+On the server side, run one of the following commands:
+    vLLM OpenAI API server
+    python -m vllm.entrypoints.openai.api_server \
+        --model <your_model> --swap-space 16 \
+        --disable-log-requests
+
+    (TGI backend)
+    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
+
+On the client side, run:
+    python benchmarks/benchmark_serving.py \
+        --backend <backend> \
+        --model <your_model> \
+        --dataset-name sharegpt \
+        --dataset-path <path to dataset> \
+        --request-rate <request_rate> \ # By default <request_rate> is inf
+        --num-prompts <num_prompts> # By default <num_prompts> is 1000
+"""
+import argparse
+import asyncio
+import json
+import os
+import random
+import time
+import warnings
+from dataclasses import dataclass
+from datetime import datetime
+from typing import AsyncGenerator, List, Optional, Tuple
+
+import numpy as np
+from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
+                                  RequestFuncOutput)
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
+
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+
+@dataclass
+class BenchmarkMetrics:
+    completed: int
+    total_input: int
+    total_output: int
+    request_throughput: float
+    input_throughput: float
+    output_throughput: float
+    mean_ttft_ms: float
+    median_ttft_ms: float
+    p99_ttft_ms: float
+    mean_tpot_ms: float
+    median_tpot_ms: float
+    p99_tpot_ms: float
+
+
+def sample_sharegpt_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
+) -> List[Tuple[str, int, int]]:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+    # Filter out the conversations with less than 2 turns.
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+    # Only keep the first two turns of each conversation.
+    dataset = [(data["conversations"][0]["value"],
+                data["conversations"][1]["value"]) for data in dataset]
+
+    # Shuffle the dataset.
+    random.shuffle(dataset)
+
+    # Filter out sequences that are too long or too short
+    filtered_dataset: List[Tuple[str, int, int]] = []
+    for i in range(len(dataset)):
+        if len(filtered_dataset) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt = dataset[i][0]
+        prompt_token_ids = tokenizer(prompt).input_ids
+        completion = dataset[i][1]
+        completion_token_ids = tokenizer(completion).input_ids
+        prompt_len = len(prompt_token_ids)
+        output_len = len(completion_token_ids
+                         ) if fixed_output_len is None else fixed_output_len
+        if prompt_len < 4 or output_len < 4:
+            # Prune too short sequences.
+            continue
+        if prompt_len > 1024 or prompt_len + output_len > 2048:
+            # Prune too long sequences.
+            continue
+        filtered_dataset.append((prompt, prompt_len, output_len))
+
+    return filtered_dataset
+
+
+def sample_sonnet_requests(
+    dataset_path: str,
+    num_requests: int,
+    input_len: int,
+    output_len: int,
+    prefix_len: int,
+    tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, str, int, int]]:
+    assert (
+        input_len > prefix_len
+    ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        poem_lines = f.readlines()
+
+    # Tokenize the poem lines.
+    poem_token_ids = tokenizer(poem_lines).input_ids
+    average_poem_len = sum(
+        len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids)
+
+    # Base prefix for all requests.
+    base_prompt = "Pick as many lines as you can from these poem lines:\n"
+    base_message = [{
+        "role": "user",
+        "content": base_prompt,
+    }]
+    base_prompt_formatted = tokenizer.apply_chat_template(
+        base_message, add_generation_prompt=True, tokenize=False)
+    base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)
+
+    assert (
+        input_len > base_prompt_offset
+    ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}."
+    num_input_lines = round(
+        (input_len - base_prompt_offset) / average_poem_len)
+
+    # First approximately `prefix_len` number of tokens in the
+    # prompt are fixed poem lines.
+    assert (
+        prefix_len > base_prompt_offset
+    ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}."
+
+    num_prefix_lines = round(
+        (prefix_len - base_prompt_offset) / average_poem_len)
+    prefix_lines = poem_lines[:num_prefix_lines]
+
+    # Sample the rest of lines per request.
+    sampled_requests: List[Tuple[str, int, int]] = []
+    for _ in range(num_requests):
+        sampled_lines = "".join(
+            prefix_lines +
+            random.sample(poem_lines, num_input_lines - num_prefix_lines))
+
+        prompt = f"{base_prompt}{sampled_lines}"
+        message = [
+            {
+                "role": "user",
+                "content": prompt,
+            },
+        ]
+        prompt_formatted = tokenizer.apply_chat_template(
+            message, add_generation_prompt=True, tokenize=False)
+        prompt_len = len(tokenizer(prompt_formatted).input_ids)
+        sampled_requests.append(
+            (prompt, prompt_formatted, prompt_len, output_len))
+
+    return sampled_requests
+
+
+async def get_request(
+    input_requests: List[Tuple[str, int, int]],
+    request_rate: float,
+) -> AsyncGenerator[Tuple[str, int, int], None]:
+    input_requests = iter(input_requests)
+    for request in input_requests:
+        yield request
+
+        if request_rate == float("inf"):
+            # If the request rate is infinity, then we don't need to wait.
+            continue
+        # Sample the request interval from the exponential distribution.
+        interval = np.random.exponential(1.0 / request_rate)
+        # The next request will be sent after the interval.
+        await asyncio.sleep(interval)
+
+
+def calculate_metrics(
+    input_requests: List[Tuple[str, int, int]],
+    outputs: List[RequestFuncOutput],
+    dur_s: float,
+    tokenizer: PreTrainedTokenizerBase,
+) -> Tuple[BenchmarkMetrics, List[int]]:
+    actual_output_lens = []
+    total_input = 0
+    completed = 0
+    tpots = []
+    ttfts = []
+    for i in range(len(outputs)):
+        if outputs[i].success:
+            output_len = len(tokenizer(outputs[i].generated_text).input_ids)
+            actual_output_lens.append(output_len)
+            total_input += input_requests[i][1]
+            if output_len > 1:
+                tpots.append(
+                    (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
+            ttfts.append(outputs[i].ttft)
+            completed += 1
+        else:
+            actual_output_lens.append(0)
+
+    metrics = BenchmarkMetrics(
+        completed=completed,
+        total_input=total_input,
+        total_output=sum(actual_output_lens),
+        request_throughput=completed / dur_s,
+        input_throughput=total_input / dur_s,
+        output_throughput=sum(actual_output_lens) / dur_s,
+        mean_ttft_ms=np.mean(ttfts or 0) *
+        1000,  # ttfts is empty if streaming is not supported by backend
+        median_ttft_ms=np.median(ttfts or 0) * 1000,
+        p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
+        mean_tpot_ms=np.mean(tpots) * 1000,
+        median_tpot_ms=np.median(tpots) * 1000,
+        p99_tpot_ms=np.percentile(tpots, 99) * 1000,
+    )
+
+    return metrics, actual_output_lens
+
+
+async def benchmark(
+    backend: str,
+    api_url: str,
+    model_id: str,
+    tokenizer: PreTrainedTokenizerBase,
+    input_requests: List[Tuple[str, int, int]],
+    best_of: int,
+    use_beam_search: bool,
+    request_rate: float,
+    disable_tqdm: bool,
+):
+    if backend in ASYNC_REQUEST_FUNCS:
+        request_func = ASYNC_REQUEST_FUNCS.get(backend)
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
+
+    print(f"Traffic request rate: {request_rate}")
+
+    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+
+    benchmark_start_time = time.perf_counter()
+    tasks = []
+    async for request in get_request(input_requests, request_rate):
+        prompt, prompt_len, output_len = request
+        request_func_input = RequestFuncInput(
+            model=model_id,
+            prompt=prompt,
+            api_url=api_url,
+            prompt_len=prompt_len,
+            output_len=output_len,
+            best_of=best_of,
+            use_beam_search=use_beam_search,
+        )
+        tasks.append(
+            asyncio.create_task(
+                request_func(request_func_input=request_func_input,
+                             pbar=pbar)))
+    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
+
+    if not disable_tqdm:
+        pbar.close()
+
+    benchmark_duration = time.perf_counter() - benchmark_start_time
+
+    metrics, actual_output_lens = calculate_metrics(
+        input_requests=input_requests,
+        outputs=outputs,
+        dur_s=benchmark_duration,
+        tokenizer=tokenizer,
+    )
+
+    print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
+    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
+                                    benchmark_duration))
+    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total generated tokens:",
+                                 metrics.total_output))
+    print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
+                                    metrics.request_throughput))
+    print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):",
+                                    metrics.input_throughput))
+    print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
+                                    metrics.output_throughput))
+    print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-'))
+    print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
+    print("{:<40} {:<10.2f}".format("Median TTFT (ms):",
+                                    metrics.median_ttft_ms))
+    print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
+    print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 1st token)',
+                               n=50,
+                               c='-'))
+    print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
+    print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
+                                    metrics.median_tpot_ms))
+    print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
+    print("=" * 50)
+
+    result = {
+        "duration": benchmark_duration,
+        "completed": metrics.completed,
+        "total_input_tokens": metrics.total_input,
+        "total_output_tokens": metrics.total_output,
+        "request_throughput": metrics.request_throughput,
+        "input_throughput": metrics.input_throughput,
+        "output_throughput": metrics.output_throughput,
+        "mean_ttft_ms": metrics.mean_ttft_ms,
+        "median_ttft_ms": metrics.median_ttft_ms,
+        "p99_ttft_ms": metrics.p99_ttft_ms,
+        "mean_tpot_ms": metrics.mean_tpot_ms,
+        "median_tpot_ms": metrics.median_tpot_ms,
+        "p99_tpot_ms": metrics.p99_tpot_ms,
+        "input_lens": [output.prompt_len for output in outputs],
+        "output_lens": actual_output_lens,
+        "ttfts": [output.ttft for output in outputs],
+        "itls": [output.itl for output in outputs],
+        "generated_texts": [output.generated_text for output in outputs],
+        "errors": [output.error for output in outputs],
+    }
+    return result
+
+
+def main(args: argparse.Namespace):
+    print(args)
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    backend = args.backend
+    model_id = args.model
+    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+
+    if args.base_url is not None:
+        api_url = f"{args.base_url}{args.endpoint}"
+    else:
+        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
+
+    tokenizer = get_tokenizer(tokenizer_id,
+                              trust_remote_code=args.trust_remote_code)
+
+    if args.dataset is not None:
+        warnings.warn(
+            "The '--dataset' argument will be deprecated in the next "
+            "release. Please use '--dataset-name' and "
+            "'--dataset-path' in the future runs.",
+            stacklevel=2)
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
+        )
+
+    elif args.dataset_name == "sharegpt":
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
+        )
+
+    elif args.dataset_name == "sonnet":
+        # Do not format the prompt, pass to message directly
+        if args.backend == "openai-chat":
+            input_requests = sample_sonnet_requests(
+                dataset_path=args.dataset_path,
+                num_requests=args.num_prompts,
+                input_len=args.sonnet_input_len,
+                output_len=args.sonnet_output_len,
+                prefix_len=args.sonnet_prefix_len,
+                tokenizer=tokenizer,
+            )
+            input_requests = [(prompt, prompt_len, output_len)
+                              for prompt, prompt_formatted, prompt_len,
+                              output_len in input_requests]
+        else:
+            assert (
+                tokenizer.chat_template or tokenizer.default_chat_template
+            ), "Tokenizer/model must have chat template for sonnet dataset."
+            input_requests = sample_sonnet_requests(
+                dataset_path=args.dataset_path,
+                num_requests=args.num_prompts,
+                input_len=args.sonnet_input_len,
+                output_len=args.sonnet_output_len,
+                prefix_len=args.sonnet_prefix_len,
+                tokenizer=tokenizer,
+            )
+            input_requests = [(prompt_formatted, prompt_len, output_len)
+                              for prompt, prompt_formatted, prompt_len,
+                              output_len in input_requests]
+
+    else:
+        raise ValueError(f"Unknown dataset: {args.dataset_name}")
+
+    benchmark_result = asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            model_id=model_id,
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            best_of=args.best_of,
+            use_beam_search=args.use_beam_search,
+            request_rate=args.request_rate,
+            disable_tqdm=args.disable_tqdm,
+        ))
+
+    # Save config and results to json
+    if args.save_result:
+        result_json = {}
+
+        # Setup
+        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
+        result_json["date"] = current_dt
+        result_json["backend"] = backend
+        result_json["model_id"] = model_id
+        result_json["tokenizer_id"] = tokenizer_id
+        result_json["best_of"] = args.best_of
+        result_json["use_beam_search"] = args.use_beam_search
+        result_json["num_prompts"] = args.num_prompts
+
+        # Metadata
+        if args.metadata:
+            for item in args.metadata:
+                if "=" in item:
+                    kvstring = item.split("=")
+                    result_json[kvstring[0].strip()] = kvstring[1].strip()
+                else:
+                    raise ValueError(
+                        "Invalid metadata format. Please use KEY=VALUE format."
+                    )
+
+        # Traffic
+        result_json["request_rate"] = (
+            args.request_rate if args.request_rate < float("inf") else "inf")
+
+        # Merge with benchmark result
+        result_json = {**result_json, **benchmark_result}
+
+        # Save to file
+        base_model_id = model_id.split("/")[-1]
+        file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"  #noqa
+        if args.result_dir:
+            file_name = os.path.join(args.result_dir, file_name)
+        with open(file_name, "w") as outfile:
+            json.dump(result_json, outfile)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Benchmark the online serving throughput.")
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="vllm",
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+    )
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        default=None,
+        help="Server or API base url if not using http host and port.",
+    )
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument(
+        "--endpoint",
+        type=str,
+        default="/v1/completions",
+        help="API endpoint.",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default=None,
+        help="Path to the ShareGPT dataset, will be deprecated in the "
+        "next release.",
+    )
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        default="sharegpt",
+        choices=["sharegpt", "sonnet"],
+        help="Name of the dataset to benchmark on.",
+    )
+    parser.add_argument("--dataset-path",
+                        type=str,
+                        default=None,
+                        help="Path to the dataset.")
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Name of the model.",
+    )
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        help=
+        "Name or path of the tokenizer, if not using the default tokenizer.",
+    )
+    parser.add_argument(
+        "--best-of",
+        type=int,
+        default=1,
+        help="Generates `best_of` sequences per prompt and "
+        "returns the best one.",
+    )
+    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=1000,
+        help="Number of prompts to process.",
+    )
+    parser.add_argument(
+        "--sharegpt-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output length "
+        "from the ShareGPT dataset.")
+    parser.add_argument(
+        "--sonnet-input-len",
+        type=int,
+        default=550,
+        help=
+        "Number of input tokens per request, used only for sonnet dataset.",
+    )
+    parser.add_argument(
+        "--sonnet-output-len",
+        type=int,
+        default=150,
+        help=
+        "Number of output tokens per request, used only for sonnet dataset.",
+    )
+    parser.add_argument(
+        "--sonnet-prefix-len",
+        type=int,
+        default=200,
+        help=
+        "Number of prefix tokens per request, used only for sonnet dataset.",
+    )
+    parser.add_argument(
+        "--request-rate",
+        type=float,
+        default=float("inf"),
+        help="Number of requests per second. If this is inf, "
+        "then all the requests are sent at time 0. "
+        "Otherwise, we use Poisson process to synthesize "
+        "the request arrival times.",
+    )
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Trust remote code from huggingface",
+    )
+    parser.add_argument(
+        "--disable-tqdm",
+        action="store_true",
+        help="Specify to disable tqdm progress bar.",
+    )
+    parser.add_argument(
+        "--save-result",
+        action="store_true",
+        help="Specify to save benchmark results to a json file",
+    )
+    parser.add_argument(
+        "--metadata",
+        metavar="KEY=VALUE",
+        nargs="*",
+        help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
+        "for metadata of this run to be saved in the result JSON file "
+        "for record keeping purposes.",
+    )
+    parser.add_argument(
+        "--result-dir",
+        type=str,
+        default=None,
+        help="Specify directory to save benchmark json results."
+        "If not specified, results are saved in the current directory.",
+    )
+
+    args = parser.parse_args()
+    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -0,0 +1,387 @@
+"""Benchmark offline inference throughput."""
+import argparse
+import json
+import random
+import time
+from typing import List, Optional, Tuple
+
+import torch
+from tqdm import tqdm
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          PreTrainedTokenizerBase)
+
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+
+def sample_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int],
+) -> List[Tuple[str, int, int]]:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+    # Filter out the conversations with less than 2 turns.
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+    # Only keep the first two turns of each conversation.
+    dataset = [(data["conversations"][0]["value"],
+                data["conversations"][1]["value"]) for data in dataset]
+
+    # Shuffle the dataset.
+    random.shuffle(dataset)
+
+    # Filter out sequences that are too long or too short
+    filtered_dataset: List[Tuple[str, int, int]] = []
+    for i in range(len(dataset)):
+        if len(filtered_dataset) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt = dataset[i][0]
+        prompt_token_ids = tokenizer(prompt).input_ids
+        completion = dataset[i][1]
+        completion_token_ids = tokenizer(completion).input_ids
+        prompt_len = len(prompt_token_ids)
+        output_len = len(completion_token_ids
+                         ) if fixed_output_len is None else fixed_output_len
+        if prompt_len < 4 or output_len < 4:
+            # Prune too short sequences.
+            continue
+        if prompt_len > 1024 or prompt_len + output_len > 2048:
+            # Prune too long sequences.
+            continue
+        filtered_dataset.append((prompt, prompt_len, output_len))
+
+    return filtered_dataset
+
+
+def run_vllm(
+    requests: List[Tuple[str, int, int]],
+    model: str,
+    tokenizer: str,
+    quantization: Optional[str],
+    tensor_parallel_size: int,
+    seed: int,
+    n: int,
+    use_beam_search: bool,
+    trust_remote_code: bool,
+    dtype: str,
+    max_model_len: Optional[int],
+    enforce_eager: bool,
+    kv_cache_dtype: str,
+    quantization_param_path: Optional[str],
+    device: str,
+    enable_prefix_caching: bool,
+    enable_chunked_prefill: bool,
+    max_num_batched_tokens: int,
+    gpu_memory_utilization: float = 0.9,
+    download_dir: Optional[str] = None,
+) -> float:
+    from vllm import LLM, SamplingParams
+    llm = LLM(
+        model=model,
+        tokenizer=tokenizer,
+        quantization=quantization,
+        tensor_parallel_size=tensor_parallel_size,
+        seed=seed,
+        trust_remote_code=trust_remote_code,
+        dtype=dtype,
+        max_model_len=max_model_len,
+        gpu_memory_utilization=gpu_memory_utilization,
+        enforce_eager=enforce_eager,
+        kv_cache_dtype=kv_cache_dtype,
+        quantization_param_path=quantization_param_path,
+        device=device,
+        enable_prefix_caching=enable_prefix_caching,
+        download_dir=download_dir,
+        enable_chunked_prefill=enable_chunked_prefill,
+        max_num_batched_tokens=max_num_batched_tokens,
+    )
+
+    # Add the requests to the engine.
+    prompts = []
+    sampling_params = []
+    for prompt, _, output_len in requests:
+        prompts.append(prompt)
+        sampling_params.append(
+            SamplingParams(
+                n=n,
+                temperature=0.0 if use_beam_search else 1.0,
+                top_p=1.0,
+                use_beam_search=use_beam_search,
+                ignore_eos=True,
+                max_tokens=output_len,
+            ))
+
+    start = time.perf_counter()
+    llm.generate(prompts, sampling_params, use_tqdm=True)
+    end = time.perf_counter()
+    return end - start
+
+
+def run_hf(
+    requests: List[Tuple[str, int, int]],
+    model: str,
+    tokenizer: PreTrainedTokenizerBase,
+    n: int,
+    use_beam_search: bool,
+    max_batch_size: int,
+    trust_remote_code: bool,
+) -> float:
+    assert not use_beam_search
+    llm = AutoModelForCausalLM.from_pretrained(
+        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
+    if llm.config.model_type == "llama":
+        # To enable padding in the HF backend.
+        tokenizer.pad_token = tokenizer.eos_token
+    llm = llm.cuda()
+
+    pbar = tqdm(total=len(requests))
+    start = time.perf_counter()
+    batch: List[str] = []
+    max_prompt_len = 0
+    max_output_len = 0
+    for i in range(len(requests)):
+        prompt, prompt_len, output_len = requests[i]
+        # Add the prompt to the batch.
+        batch.append(prompt)
+        max_prompt_len = max(max_prompt_len, prompt_len)
+        max_output_len = max(max_output_len, output_len)
+        if len(batch) < max_batch_size and i != len(requests) - 1:
+            # Check if we can add more requests to the batch.
+            _, next_prompt_len, next_output_len = requests[i + 1]
+            if (max(max_prompt_len, next_prompt_len) +
+                    max(max_output_len, next_output_len)) <= 2048:
+                # We can add more requests to the batch.
+                continue
+
+        # Generate the sequences.
+        input_ids = tokenizer(batch, return_tensors="pt",
+                              padding=True).input_ids
+        llm_outputs = llm.generate(
+            input_ids=input_ids.cuda(),
+            do_sample=not use_beam_search,
+            num_return_sequences=n,
+            temperature=1.0,
+            top_p=1.0,
+            use_cache=True,
+            max_new_tokens=max_output_len,
+        )
+        # Include the decoding time.
+        tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
+        pbar.update(len(batch))
+
+        # Clear the batch.
+        batch = []
+        max_prompt_len = 0
+        max_output_len = 0
+    end = time.perf_counter()
+    return end - start
+
+
+def run_mii(
+    requests: List[Tuple[str, int, int]],
+    model: str,
+    tensor_parallel_size: int,
+    output_len: int,
+) -> float:
+    from mii import client, serve
+    llm = serve(model, tensor_parallel=tensor_parallel_size)
+    prompts = [prompt for prompt, _, _ in requests]
+
+    start = time.perf_counter()
+    llm.generate(prompts, max_new_tokens=output_len)
+    end = time.perf_counter()
+    client = client(model)
+    client.terminate_server()
+    return end - start
+
+
+def main(args: argparse.Namespace):
+    print(args)
+    random.seed(args.seed)
+
+    # Sample the requests.
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer, trust_remote_code=args.trust_remote_code)
+    if args.dataset is None:
+        # Synthesize a prompt with the given input length.
+        prompt = "hi" * (args.input_len - 1)
+        requests = [(prompt, args.input_len, args.output_len)
+                    for _ in range(args.num_prompts)]
+    else:
+        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
+                                   args.output_len)
+
+    if args.backend == "vllm":
+        elapsed_time = run_vllm(
+            requests, args.model, args.tokenizer, args.quantization,
+            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
+            args.trust_remote_code, args.dtype, args.max_model_len,
+            args.enforce_eager, args.kv_cache_dtype,
+            args.quantization_param_path, args.device,
+            args.enable_prefix_caching, args.enable_chunked_prefill,
+            args.max_num_batched_tokens, args.gpu_memory_utilization,
+            args.download_dir)
+    elif args.backend == "hf":
+        assert args.tensor_parallel_size == 1
+        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
+                              args.use_beam_search, args.hf_max_batch_size,
+                              args.trust_remote_code)
+    elif args.backend == "mii":
+        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
+                               args.output_len)
+    else:
+        raise ValueError(f"Unknown backend: {args.backend}")
+    total_num_tokens = sum(prompt_len + output_len
+                           for _, prompt_len, output_len in requests)
+    print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+          f"{total_num_tokens / elapsed_time:.2f} tokens/s")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Benchmark the throughput.")
+    parser.add_argument("--backend",
+                        type=str,
+                        choices=["vllm", "hf", "mii"],
+                        default="vllm")
+    parser.add_argument("--dataset",
+                        type=str,
+                        default=None,
+                        help="Path to the dataset.")
+    parser.add_argument("--input-len",
+                        type=int,
+                        default=None,
+                        help="Input prompt length for each request")
+    parser.add_argument("--output-len",
+                        type=int,
+                        default=None,
+                        help="Output length for each request. Overrides the "
+                        "output length from the dataset.")
+    parser.add_argument("--model", type=str, default="facebook/opt-125m")
+    parser.add_argument("--tokenizer", type=str, default=None)
+    parser.add_argument('--quantization',
+                        '-q',
+                        choices=[*QUANTIZATION_METHODS, None],
+                        default=None)
+    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
+    parser.add_argument("--n",
+                        type=int,
+                        default=1,
+                        help="Number of generated sequences per prompt.")
+    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument("--num-prompts",
+                        type=int,
+                        default=1000,
+                        help="Number of prompts to process.")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--hf-max-batch-size",
+                        type=int,
+                        default=None,
+                        help="Maximum batch size for HF backend.")
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
+                        help='trust remote code from huggingface')
+    parser.add_argument(
+        '--max-model-len',
+        type=int,
+        default=None,
+        help='Maximum length of a sequence (including prompt and output). '
+        'If None, will be derived from the model.')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1.'
+                        'If unspecified, will use the default value of 0.9.')
+    parser.add_argument("--enforce-eager",
+                        action="store_true",
+                        help="enforce eager execution")
+    parser.add_argument(
+        "--kv-cache-dtype",
+        type=str,
+        choices=["auto", "fp8"],
+        default="auto",
+        help=
+        'Data type for kv cache storage. If "auto", will use model data type. '
+        'FP8_E5M2 (without scaling) is only supported on cuda version greater '
+        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
+        'common inference criteria.')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        choices=["cuda", "cpu"],
+        help='device type for vLLM execution, supporting CUDA and CPU.')
+    parser.add_argument(
+        "--enable-prefix-caching",
+        action='store_true',
+        help="enable automatic prefix caching for vLLM backend.")
+    parser.add_argument("--enable-chunked-prefill",
+                        action='store_true',
+                        help="enable chunked prefill for vLLM backend.")
+    parser.add_argument('--max-num-batched-tokens',
+                        type=int,
+                        default=None,
+                        help='maximum number of batched tokens per '
+                        'iteration')
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
+    args = parser.parse_args()
+    if args.tokenizer is None:
+        args.tokenizer = args.model
+    if args.dataset is None:
+        assert args.input_len is not None
+        assert args.output_len is not None
+    else:
+        assert args.input_len is None
+
+    if args.backend == "vllm":
+        if args.hf_max_batch_size is not None:
+            raise ValueError("HF max batch size is only for HF backend.")
+    elif args.backend == "hf":
+        if args.hf_max_batch_size is None:
+            raise ValueError("HF max batch size is required for HF backend.")
+        if args.quantization is not None:
+            raise ValueError("Quantization is only for vLLM backend.")
+    elif args.backend == "mii":
+        if args.dtype != "auto":
+            raise ValueError("dtype must be auto for MII backend.")
+        if args.n != 1:
+            raise ValueError("n must be 1 for MII backend.")
+        if args.use_beam_search:
+            raise ValueError("Beam search is not supported for MII backend.")
+        if args.quantization is not None:
+            raise ValueError("Quantization is only for vLLM backend.")
+        if args.hf_max_batch_size is not None:
+            raise ValueError("HF max batch size is only for HF backend.")
+        if args.tokenizer != args.model:
+            raise ValueError("Tokenizer must be the same as the model for MII "
+                             "backend.")
+    main(args)
--- a/benchmarks/kernels/benchmark_aqlm.py
+++ b/benchmarks/kernels/benchmark_aqlm.py
@@ -0,0 +1,302 @@
+import argparse
+import os
+import sys
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.aqlm import (
+    dequantize_weight, generic_dequantize_gemm, get_int_dtype,
+    optimized_dequantize_gemm)
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def torch_mult(
+        input: torch.Tensor,  #  [..., in_features]
+        weights: torch.Tensor,
+        scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+) -> torch.Tensor:
+    output = F.linear(input, weights)
+    return output
+
+
+def dequant_out_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    if bias is None:
+        output = F.linear(input, weights, bias)
+        orig_shape = output.shape
+        flattened_output = output.view(-1, output.size(-1))
+        f_scales = scales.view(-1, scales.shape[0])
+        b_scales = f_scales.expand(flattened_output.shape[0], -1)
+        flattened_output *= b_scales
+        return flattened_output.view(orig_shape)
+    else:
+        b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+            -1, weights.shape[1])
+        weights *= b_scales
+        return F.linear(input, weights, bias)
+
+
+def dequant_weight_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+        -1, weights.shape[1])
+    weights *= b_scales
+    return F.linear(input, weights, bias)
+
+
+def dequant_no_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    return F.linear(input, weights, bias)
+
+
+# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
+# the generic pytorch version.
+# Just visual comparison.
+def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
+
+    n = parts.sum().item()
+
+    device = torch.device('cuda:0')
+
+    code_range = (1 << bits) // 2
+    ingroups = 8
+
+    codes = torch.randint(-code_range,
+                          code_range,
+                          size=(n, k // ingroups, nbooks),
+                          dtype=get_int_dtype(bits),
+                          device=device)
+
+    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+                            dtype=torch.float16,
+                            device=device)
+
+    count = 0
+    for index in range(16):
+        for i in range(8):
+            for book in range(nbooks):
+                codebooks[book, index, 0, i] = count * (10**book)
+            count += 1
+
+    print("codes shape", codes.shape)
+
+    for i in range(16):
+        for book in range(nbooks):
+            codes[0, i, book] = i
+            codes[0, -i, book] = i
+
+    weights = dequantize_weight(codes, codebooks, None)
+    weights2 = ops.aqlm_dequant(codes, codebooks, parts)
+
+    print("weights shape:", weights.shape)
+    print("weights2 shape:", weights2.shape)
+
+    print("weights are:", weights)
+    print("weights2 are:", weights2)
+
+    print("first 128 weights are", weights[0, 0:128].to(torch.int32))
+    print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
+
+    print("last 128 weights are", weights[0, -128:])
+    print("last 128 weights2 are:", weights2[0, -128:])
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description="Benchmark aqlm performance.")
+
+    # Add arguments
+    parser.add_argument("--nbooks",
+                        type=int,
+                        default=1,
+                        help="Number of codebooks (default: 1)")
+    parser.add_argument("--bits",
+                        type=int,
+                        default=16,
+                        help="Number of bits per code element (default: 16)")
+    parser.add_argument(
+        "--test",
+        type=bool,
+        default=False,
+        help="Run the decompression/dequant tester rather than benchmarking "
+        "(default: False)")
+
+    # Parse the arguments
+    args = parser.parse_args()
+
+    # Extract values
+    nbooks = args.nbooks
+    bits = args.bits
+
+    if args.test:
+        dequant_test(4096, torch.tensor((4096, )), nbooks, bits)
+        return
+
+    # Otherwise, benchmark.
+    methods = [
+        ops.aqlm_gemm,
+        dequant_out_scale,
+        generic_dequantize_gemm,
+        optimized_dequantize_gemm,
+        dequant_weight_scale,
+        torch_mult,
+        dequant_no_scale,
+    ]
+
+    filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
+    print(f"writing benchmarks to file {filename}")
+    with open(filename, "w") as f:
+        sys.stdout = f
+
+        print('m | k | n | n parts', end='')
+        for method in methods:
+            print(f" | {method.__name__.replace('_', ' ')} (µs)", end='')
+        print('')
+
+        # These are reasonable prefill sizes.
+        ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )),
+                         (4096, (11008, 11008)), (11008, (4096, )))
+
+        # reasonable ranges for m.
+        for m in [
+                1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112,
+                128, 256, 512, 1024, 1536, 2048, 3072, 4096
+        ]:
+            print(f'{m}', file=sys.__stdout__)
+            for ksp in ksandpartions:
+                run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits,
+                         methods)
+
+        sys.stdout = sys.__stdout__
+
+
+def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
+             methods):
+
+    # I didn't see visible improvements from increasing these, but feel free :)
+    num_warmup_trials = 1
+    num_trials = 1
+
+    num_calls = 100
+
+    # warmup.
+    for method in methods:
+        for _ in range(num_warmup_trials):
+            run_timing(
+                num_calls=num_calls,
+                m=m,
+                k=k,
+                parts=parts,
+                nbooks=nbooks,
+                bits=bits,
+                method=method,
+            )
+
+    n = parts.sum().item()
+    print(f'{m} | {k} | {n} | {parts.tolist()}', end='')
+
+    for method in methods:
+        best_time_us = 1e20
+        for _ in range(num_trials):
+            kernel_dur_ms = run_timing(
+                num_calls=num_calls,
+                m=m,
+                k=k,
+                parts=parts,
+                nbooks=nbooks,
+                bits=bits,
+                method=method,
+            )
+
+            kernel_dur_us = 1000 * kernel_dur_ms
+
+            if kernel_dur_us < best_time_us:
+                best_time_us = kernel_dur_us
+
+        print(f' | {kernel_dur_us:.0f}', end='')
+
+    print('')
+
+
+def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
+               nbooks: int, bits: int, method) -> float:
+
+    n = parts.sum().item()
+
+    device = torch.device('cuda:0')
+
+    input = torch.randn((1, m, k), dtype=torch.float16, device=device)
+
+    code_range = (1 << bits) // 2
+    ingroups = 8
+
+    codes = torch.randint(-code_range,
+                          code_range,
+                          size=(n, k // ingroups, nbooks),
+                          dtype=get_int_dtype(bits),
+                          device=device)
+
+    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+                            dtype=torch.float16,
+                            device=device)
+
+    scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
+
+    # for comparison to just a pytorch mult.
+    weights = torch.randn((n, k), dtype=torch.float16, device=device)
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    start_event.record()
+
+    if method is torch_mult:
+        for i in range(num_calls):
+            torch_mult(input, weights, scales)
+    else:
+        for i in range(num_calls):
+            method(input, codes, codebooks, scales, parts, None)
+
+    end_event.record()
+    end_event.synchronize()
+
+    dur_ms = start_event.elapsed_time(end_event) / num_calls
+    return dur_ms
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/benchmarks/kernels/benchmark_mixtral_moe.py
+++ b/benchmarks/kernels/benchmark_mixtral_moe.py
@@ -0,0 +1,215 @@
+import argparse
+import json
+import os
+import sys
+
+import torch
+import torch.nn.functional as F
+import triton
+from tqdm import tqdm
+
+from vllm.model_executor.layers.fused_moe import (fused_moe,
+                                                  get_config_file_name)
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def main(dtype: str):
+    method = fused_moe
+    for bs in [
+            1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
+            2048, 3072, 4096
+    ]:
+        run_grid(bs, method=method, dtype=dtype)
+
+
+def run_grid(bs, method, dtype: str):
+    d_model = 4096
+    num_total_experts = 8
+    top_k = 2
+    tp_size = 2
+    model_intermediate_size = 14336
+    num_layers = 32
+    num_calls = 100
+
+    num_warmup_trials = 1
+    num_trials = 1
+
+    configs = []
+
+    for block_size_n in [32, 64, 128, 256]:
+        for block_size_m in [16, 32, 64, 128, 256]:
+            for block_size_k in [64, 128, 256]:
+                for group_size_m in [1, 16, 32, 64]:
+                    for num_warps in [4, 8]:
+                        for num_stages in [2, 3, 4, 5]:
+                            configs.append({
+                                "BLOCK_SIZE_M": block_size_m,
+                                "BLOCK_SIZE_N": block_size_n,
+                                "BLOCK_SIZE_K": block_size_k,
+                                "GROUP_SIZE_M": group_size_m,
+                                "num_warps": num_warps,
+                                "num_stages": num_stages,
+                            })
+
+    best_config = None
+    best_time_us = 1e20
+
+    print(f'{tp_size=} {bs=}')
+
+    for config in tqdm(configs):
+        # warmup
+        try:
+            for _ in range(num_warmup_trials):
+                run_timing(
+                    num_calls=num_calls,
+                    bs=bs,
+                    d_model=d_model,
+                    num_total_experts=num_total_experts,
+                    top_k=top_k,
+                    tp_size=tp_size,
+                    model_intermediate_size=model_intermediate_size,
+                    method=method,
+                    config=config,
+                    dtype=dtype,
+                )
+        except triton.runtime.autotuner.OutOfResources:
+            continue
+
+        # trial
+        for _ in range(num_trials):
+            kernel_dur_ms = run_timing(
+                num_calls=num_calls,
+                bs=bs,
+                d_model=d_model,
+                num_total_experts=num_total_experts,
+                top_k=top_k,
+                tp_size=tp_size,
+                model_intermediate_size=model_intermediate_size,
+                method=method,
+                config=config,
+                dtype=dtype,
+            )
+
+            kernel_dur_us = 1000 * kernel_dur_ms
+            model_dur_ms = kernel_dur_ms * num_layers
+
+            if kernel_dur_us < best_time_us:
+                best_config = config
+                best_time_us = kernel_dur_us
+
+                tqdm.write(
+                    f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
+                    f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
+                    f'{d_model=} {model_intermediate_size=} {num_layers=}')
+
+    print("best_time_us", best_time_us)
+    print("best_config", best_config)
+
+    # holds Dict[str, Dict[str, int]]
+    filename = get_config_file_name(num_total_experts,
+                                    model_intermediate_size // tp_size,
+                                    "float8" if dtype == "float8" else None)
+    print(f"writing config to file {filename}")
+    existing_content = {}
+    if os.path.exists(filename):
+        with open(filename, "r") as f:
+            existing_content = json.load(f)
+    existing_content[str(bs)] = best_config
+    with open(filename, "w") as f:
+        json.dump(existing_content, f, indent=4)
+        f.write("\n")
+
+
+def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
+               top_k: int, tp_size: int, model_intermediate_size: int, method,
+               config, dtype: str) -> float:
+    shard_intermediate_size = model_intermediate_size // tp_size
+
+    hidden_states = torch.rand(
+        (bs, d_model),
+        device="cuda:0",
+        dtype=torch.float16,
+    )
+
+    w1 = torch.rand(
+        (num_total_experts, 2 * shard_intermediate_size, d_model),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+
+    w2 = torch.rand(
+        (num_total_experts, d_model, shard_intermediate_size),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+
+    w1_scale = None
+    w2_scale = None
+    a1_scale = None
+    a2_scale = None
+
+    if dtype == "float8":
+        w1 = w1.to(torch.float8_e4m3fn)
+        w2 = w2.to(torch.float8_e4m3fn)
+        w1_scale = torch.ones(num_total_experts,
+                              device=hidden_states.device,
+                              dtype=torch.float32)
+        w2_scale = torch.ones(num_total_experts,
+                              device=hidden_states.device,
+                              dtype=torch.float32)
+        a1_scale = torch.ones(1,
+                              device=hidden_states.device,
+                              dtype=torch.float32)
+        a2_scale = torch.ones(1,
+                              device=hidden_states.device,
+                              dtype=torch.float32)
+
+    gating_output = F.softmax(torch.rand(
+        (num_calls, bs, num_total_experts),
+        device=hidden_states.device,
+        dtype=torch.float32,
+    ),
+                              dim=-1)
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    start_event.record()
+    for i in range(num_calls):
+        hidden_states = method(
+            hidden_states=hidden_states,
+            w1=w1,
+            w2=w2,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            gating_output=gating_output[i],
+            topk=2,
+            renormalize=True,
+            inplace=True,
+            override_config=config,
+            use_fp8=dtype == "float8",
+        )
+    end_event.record()
+    end_event.synchronize()
+
+    dur_ms = start_event.elapsed_time(end_event) / num_calls
+    return dur_ms
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog='benchmark_mixtral_moe',
+        description='Benchmark and tune the fused_moe kernel',
+    )
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['float8', 'float16'],
+        help='Data type used for fused_moe kernel computations',
+    )
+    args = parser.parse_args()
+    sys.exit(main(args.dtype))
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -0,0 +1,211 @@
+import argparse
+import random
+import time
+from typing import Optional
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
+
+NUM_BLOCKS = 1024
+PARTITION_SIZE = 512
+
+
+@torch.inference_mode()
+def main(
+    version: str,
+    num_seqs: int,
+    seq_len: int,
+    num_query_heads: int,
+    num_kv_heads: int,
+    head_size: int,
+    use_alibi: bool,
+    block_size: int,
+    dtype: torch.dtype,
+    seed: int,
+    do_profile: bool,
+    device: str = "cuda",
+    kv_cache_dtype: Optional[str] = None,
+) -> None:
+    random.seed(seed)
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+
+    scale = float(1.0 / (head_size**0.5))
+    query = torch.empty(num_seqs,
+                        num_query_heads,
+                        head_size,
+                        dtype=dtype,
+                        device=device)
+    query.uniform_(-scale, scale)
+
+    assert num_query_heads % num_kv_heads == 0
+    alibi_slopes = None
+    if use_alibi:
+        alibi_slopes = torch.randn(num_query_heads,
+                                   dtype=torch.float,
+                                   device=device)
+
+    seq_lens = [seq_len for _ in range(num_seqs)]
+    max_seq_len = max(seq_lens)
+    seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device)
+
+    # Create the block tables.
+    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
+    block_tables = []
+    for _ in range(num_seqs):
+        block_table = [
+            random.randint(0, NUM_BLOCKS - 1)
+            for _ in range(max_num_blocks_per_seq)
+        ]
+        block_tables.append(block_table)
+    block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)
+
+    # Create the KV cache.
+    key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
+                                                            block_size,
+                                                            1,
+                                                            num_kv_heads,
+                                                            head_size,
+                                                            kv_cache_dtype,
+                                                            dtype,
+                                                            device=device)
+    key_cache, value_cache = key_caches[0], value_caches[0]
+
+    # Prepare for the paged attention kernel.
+    output = torch.empty_like(query)
+    if version == "v2":
+        num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
+        tmp_output = torch.empty(
+            size=(num_seqs, num_query_heads, num_partitions, head_size),
+            dtype=output.dtype,
+            device=output.device,
+        )
+        exp_sums = torch.empty(
+            size=(num_seqs, num_query_heads, num_partitions),
+            dtype=torch.float32,
+            device=output.device,
+        )
+        max_logits = torch.empty_like(exp_sums)
+
+    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
+        torch.cuda.synchronize()
+        if profile:
+            torch.cuda.cudart().cudaProfilerStart()
+        start_time = time.perf_counter()
+
+        # Using default kv_scale
+        kv_scale = 1.0
+
+        for _ in range(num_iters):
+            if version == "v1":
+                ops.paged_attention_v1(
+                    output,
+                    query,
+                    key_cache,
+                    value_cache,
+                    num_kv_heads,
+                    scale,
+                    block_tables,
+                    seq_lens,
+                    block_size,
+                    max_seq_len,
+                    alibi_slopes,
+                    kv_cache_dtype,
+                    kv_scale,
+                )
+            elif version == "v2":
+                ops.paged_attention_v2(
+                    output,
+                    exp_sums,
+                    max_logits,
+                    tmp_output,
+                    query,
+                    key_cache,
+                    value_cache,
+                    num_kv_heads,
+                    scale,
+                    block_tables,
+                    seq_lens,
+                    block_size,
+                    max_seq_len,
+                    alibi_slopes,
+                    kv_cache_dtype,
+                    kv_scale,
+                )
+            else:
+                raise ValueError(f"Invalid version: {version}")
+        torch.cuda.synchronize()
+
+        end_time = time.perf_counter()
+        if profile:
+            torch.cuda.cudart().cudaProfilerStart()
+        return (end_time - start_time) / num_iters
+
+    # Warmup.
+    print("Warming up...")
+    run_benchmark = run_cuda_benchmark
+    run_benchmark(num_iters=3, profile=False)
+
+    # Benchmark.
+    if do_profile:
+        latency = run_benchmark(num_iters=1, profile=True)
+    else:
+        latency = run_benchmark(num_iters=100, profile=False)
+    print(f"Kernel running time: {latency * 1000000:.3f} us")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description="Benchmark the paged attention kernel.")
+    parser.add_argument("--version",
+                        type=str,
+                        choices=["v1", "v2"],
+                        default="v2")
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument("--seq_len", type=int, default=4096)
+    parser.add_argument("--num-query-heads", type=int, default=64)
+    parser.add_argument("--num-kv-heads", type=int, default=8)
+    parser.add_argument("--head-size",
+                        type=int,
+                        choices=[64, 80, 96, 112, 128, 256],
+                        default=128)
+    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
+    parser.add_argument("--use-alibi", action="store_true")
+    parser.add_argument("--dtype",
+                        type=str,
+                        choices=["half", "bfloat16", "float"],
+                        default="half")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--profile", action="store_true")
+    parser.add_argument(
+        "--kv-cache-dtype",
+        type=str,
+        choices=["auto", "fp8"],
+        default="auto",
+        help=
+        'Data type for kv cache storage. If "auto", will use model data type. '
+        'FP8_E5M2 (without scaling) is only supported on cuda version greater '
+        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
+        'common inference criteria.')
+    args = parser.parse_args()
+    print(args)
+
+    if args.num_query_heads % args.num_kv_heads != 0:
+        raise ValueError("num_query_heads must be divisible by num_kv_heads")
+    main(
+        version=args.version,
+        num_seqs=args.batch_size,
+        seq_len=args.seq_len,
+        num_query_heads=args.num_query_heads,
+        num_kv_heads=args.num_kv_heads,
+        head_size=args.head_size,
+        block_size=args.block_size,
+        use_alibi=args.use_alibi,
+        dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+        seed=args.seed,
+        do_profile=args.profile,
+        kv_cache_dtype=args.kv_cache_dtype,
+    )
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -0,0 +1,121 @@
+import argparse
+from itertools import accumulate
+from typing import Optional
+
+import nvtx
+import torch
+
+from vllm.model_executor.layers.rotary_embedding import get_rope
+
+
+def benchmark_rope_kernels_multi_lora(
+    is_neox_style: bool,
+    batch_size: int,
+    seq_len: int,
+    num_heads: int,
+    head_size: int,
+    rotary_dim: Optional[int],
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    max_position: int = 8192,
+    base: int = 10000,
+) -> None:
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    if rotary_dim is None:
+        rotary_dim = head_size
+    # silulating serving 4 LoRAs
+    scaling_factors = [1, 2, 4, 8]
+    # batched RoPE can take multiple scaling factors
+    batched_rope = get_rope(head_size, rotary_dim, max_position, base,
+                            is_neox_style, {
+                                "type": "linear",
+                                "factor": tuple(scaling_factors)
+                            })
+    # non-batched RoPE takes only one scaling factor, we create multiple
+    # instances to simulate the same behavior
+    non_batched_ropes = []
+    for scaling_factor in scaling_factors:
+        non_batched_ropes.append(
+            get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
+                     {
+                         "type": "linear",
+                         "factor": (scaling_factor, )
+                     }))
+
+    positions = torch.randint(0, max_position, (batch_size, seq_len))
+    query = torch.randn(batch_size,
+                        seq_len,
+                        num_heads * head_size,
+                        dtype=dtype)
+    key = torch.randn_like(query)
+
+    # create query offsets for batched RoPE, we concat multiple kv cache
+    # together and each query needs to find the right kv cache of its type
+    offset_map = torch.tensor(
+        list(
+            accumulate([0] + [
+                max_position * scaling_factor * 2
+                for scaling_factor in scaling_factors[:-1]
+            ])))
+    query_types = torch.randint(0,
+                                len(scaling_factors), (batch_size, seq_len),
+                                device=device)
+    # map query types to offsets
+    query_offsets = offset_map[query_types]
+    # the kernel takes flattened offsets
+    flatten_offsets = query_offsets.flatten()
+
+    # batched queries of the same type together for non-batched RoPE
+    queries = [query[query_types == i] for i in range(len(scaling_factors))]
+    keys = [key[query_types == i] for i in range(len(scaling_factors))]
+    packed_qkr = zip(queries, keys, non_batched_ropes)
+    # synchronize before start timing
+    torch.cuda.synchronize()
+    with nvtx.annotate("non-batched", color="yellow"):
+        for q, k, r in packed_qkr:
+            r.forward(positions, q, k)
+    torch.cuda.synchronize()
+    with nvtx.annotate("batched", color="green"):
+        batched_rope.forward(positions, query, key, flatten_offsets)
+    torch.cuda.synchronize()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description="Benchmark the rotary embedding kernels.")
+    parser.add_argument("--is-neox-style", type=bool, default=True)
+    parser.add_argument("--batch-size", type=int, default=16)
+    parser.add_argument("--seq-len", type=int, default=512)
+    parser.add_argument("--num-heads", type=int, default=8)
+    parser.add_argument("--head-size",
+                        type=int,
+                        choices=[64, 80, 96, 112, 128, 256],
+                        default=128)
+    parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
+    parser.add_argument("--dtype",
+                        type=str,
+                        choices=["bfloat16", "float"],
+                        default="float")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--device",
+                        type=str,
+                        choices=["cuda:0", "cuda:1"],
+                        default="cuda:0")
+    args = parser.parse_args()
+    print(args)
+
+    benchmark_rope_kernels_multi_lora(
+        is_neox_style=args.is_neox_style,
+        batch_size=args.batch_size,
+        seq_len=args.seq_len,
+        num_heads=args.num_heads,
+        head_size=args.head_size,
+        rotary_dim=args.rotary_dim,
+        dtype=getattr(torch, args.dtype),
+        seed=args.seed,
+        device=args.device,
+    )
--- a/benchmarks/launch_tgi_server.sh
+++ b/benchmarks/launch_tgi_server.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+PORT=8000
+MODEL=$1
+TOKENS=$2
+
+docker run --gpus all --shm-size 1g -p $PORT:80 \
+           -v $PWD/data:/data \
+           ghcr.io/huggingface/text-generation-inference:1.4.0 \
+           --model-id $MODEL \
+           --sharded false  \
+           --max-input-length 1024 \
+           --max-total-tokens 2048 \
+           --max-best-of 5 \
+           --max-concurrent-requests 5000 \
+           --max-batch-total-tokens $TOKENS
--- a/benchmarks/sonnet.txt
+++ b/benchmarks/sonnet.txt
@@ -0,0 +1,518 @@
+FROM fairest creatures we desire increase,
+That thereby beauty's rose might never die,
+But as the riper should by time decease,
+His tender heir might bear his memory:
+But thou, contracted to thine own bright eyes,
+Feed'st thy light'st flame with self-substantial fuel,
+Making a famine where abundance lies,
+Thyself thy foe, to thy sweet self too cruel.
+Thou that art now the world's fresh ornament
+And only herald to the gaudy spring,
+Within thine own bud buriest thy content
+And, tender churl, makest waste in niggarding.
+Pity the world, or else this glutton be,
+To eat the world's due, by the grave and thee.
+When forty winters shall beseige thy brow,
+And dig deep trenches in thy beauty's field,
+Thy youth's proud livery, so gazed on now,
+Will be a tatter'd weed, of small worth held:
+Then being ask'd where all thy beauty lies,
+Where all the treasure of thy lusty days,
+To say, within thine own deep-sunken eyes,
+Were an all-eating shame and thriftless praise.
+How much more praise deserved thy beauty's use,
+If thou couldst answer 'This fair child of mine
+Shall sum my count and make my old excuse,'
+Proving his beauty by succession thine!
+This were to be new made when thou art old,
+And see thy blood warm when thou feel'st it cold.
+Look in thy glass, and tell the face thou viewest
+Now is the time that face should form another;
+Whose fresh repair if now thou not renewest,
+Thou dost beguile the world, unbless some mother.
+For where is she so fair whose unear'd womb
+Disdains the tillage of thy husbandry?
+Or who is he so fond will be the tomb
+Of his self-love, to stop posterity?
+Thou art thy mother's glass, and she in thee
+Calls back the lovely April of her prime:
+So thou through windows of thine age shall see
+Despite of wrinkles this thy golden time.
+But if thou live, remember'd not to be,
+Die single, and thine image dies with thee.
+Unthrifty loveliness, why dost thou spend
+Upon thyself thy beauty's legacy?
+Nature's bequest gives nothing but doth lend,
+And being frank she lends to those are free.
+Then, beauteous niggard, why dost thou abuse
+The bounteous largess given thee to give?
+Profitless usurer, why dost thou use
+So great a sum of sums, yet canst not live?
+For having traffic with thyself alone,
+Thou of thyself thy sweet self dost deceive.
+Then how, when nature calls thee to be gone,
+What acceptable audit canst thou leave?
+Thy unused beauty must be tomb'd with thee,
+Which, used, lives th' executor to be.
+Those hours, that with gentle work did frame
+The lovely gaze where every eye doth dwell,
+Will play the tyrants to the very same
+And that unfair which fairly doth excel:
+For never-resting time leads summer on
+To hideous winter and confounds him there;
+Sap cheque'd with frost and lusty leaves quite gone,
+Beauty o'ersnow'd and bareness every where:
+Then, were not summer's distillation left,
+A liquid prisoner pent in walls of glass,
+Beauty's effect with beauty were bereft,
+Nor it nor no remembrance what it was:
+But flowers distill'd though they with winter meet,
+Leese but their show; their substance still lives sweet.
+Then let not winter's ragged hand deface
+In thee thy summer, ere thou be distill'd:
+Make sweet some vial; treasure thou some place
+With beauty's treasure, ere it be self-kill'd.
+That use is not forbidden usury,
+Which happies those that pay the willing loan;
+That's for thyself to breed another thee,
+Or ten times happier, be it ten for one;
+Ten times thyself were happier than thou art,
+If ten of thine ten times refigured thee:
+Then what could death do, if thou shouldst depart,
+Leaving thee living in posterity?
+Be not self-will'd, for thou art much too fair
+To be death's conquest and make worms thine heir.
+Lo! in the orient when the gracious light
+Lifts up his burning head, each under eye
+Doth homage to his new-appearing sight,
+Serving with looks his sacred majesty;
+And having climb'd the steep-up heavenly hill,
+Resembling strong youth in his middle age,
+yet mortal looks adore his beauty still,
+Attending on his golden pilgrimage;
+But when from highmost pitch, with weary car,
+Like feeble age, he reeleth from the day,
+The eyes, 'fore duteous, now converted are
+From his low tract and look another way:
+So thou, thyself out-going in thy noon,
+Unlook'd on diest, unless thou get a son.
+Music to hear, why hear'st thou music sadly?
+Sweets with sweets war not, joy delights in joy.
+Why lovest thou that which thou receivest not gladly,
+Or else receivest with pleasure thine annoy?
+If the true concord of well-tuned sounds,
+By unions married, do offend thine ear,
+They do but sweetly chide thee, who confounds
+In singleness the parts that thou shouldst bear.
+Mark how one string, sweet husband to another,
+Strikes each in each by mutual ordering,
+Resembling sire and child and happy mother
+Who all in one, one pleasing note do sing:
+Whose speechless song, being many, seeming one,
+Sings this to thee: 'thou single wilt prove none.'
+Is it for fear to wet a widow's eye
+That thou consumest thyself in single life?
+Ah! if thou issueless shalt hap to die.
+The world will wail thee, like a makeless wife;
+The world will be thy widow and still weep
+That thou no form of thee hast left behind,
+When every private widow well may keep
+By children's eyes her husband's shape in mind.
+Look, what an unthrift in the world doth spend
+Shifts but his place, for still the world enjoys it;
+But beauty's waste hath in the world an end,
+And kept unused, the user so destroys it.
+No love toward others in that bosom sits
+That on himself such murderous shame commits.
+For shame! deny that thou bear'st love to any,
+Who for thyself art so unprovident.
+Grant, if thou wilt, thou art beloved of many,
+But that thou none lovest is most evident;
+For thou art so possess'd with murderous hate
+That 'gainst thyself thou stick'st not to conspire.
+Seeking that beauteous roof to ruinate
+Which to repair should be thy chief desire.
+O, change thy thought, that I may change my mind!
+Shall hate be fairer lodged than gentle love?
+Be, as thy presence is, gracious and kind,
+Or to thyself at least kind-hearted prove:
+Make thee another self, for love of me,
+That beauty still may live in thine or thee.
+As fast as thou shalt wane, so fast thou growest
+In one of thine, from that which thou departest;
+And that fresh blood which youngly thou bestowest
+Thou mayst call thine when thou from youth convertest.
+Herein lives wisdom, beauty and increase:
+Without this, folly, age and cold decay:
+If all were minded so, the times should cease
+And threescore year would make the world away.
+Let those whom Nature hath not made for store,
+Harsh featureless and rude, barrenly perish:
+Look, whom she best endow'd she gave the more;
+Which bounteous gift thou shouldst in bounty cherish:
+She carved thee for her seal, and meant thereby
+Thou shouldst print more, not let that copy die.
+When I do count the clock that tells the time,
+And see the brave day sunk in hideous night;
+When I behold the violet past prime,
+And sable curls all silver'd o'er with white;
+When lofty trees I see barren of leaves
+Which erst from heat did canopy the herd,
+And summer's green all girded up in sheaves
+Borne on the bier with white and bristly beard,
+Then of thy beauty do I question make,
+That thou among the wastes of time must go,
+Since sweets and beauties do themselves forsake
+And die as fast as they see others grow;
+And nothing 'gainst Time's scythe can make defence
+Save breed, to brave him when he takes thee hence.
+O, that you were yourself! but, love, you are
+No longer yours than you yourself here live:
+Against this coming end you should prepare,
+And your sweet semblance to some other give.
+So should that beauty which you hold in lease
+Find no determination: then you were
+Yourself again after yourself's decease,
+When your sweet issue your sweet form should bear.
+Who lets so fair a house fall to decay,
+Which husbandry in honour might uphold
+Against the stormy gusts of winter's day
+And barren rage of death's eternal cold?
+O, none but unthrifts! Dear my love, you know
+You had a father: let your son say so.
+Not from the stars do I my judgment pluck;
+And yet methinks I have astronomy,
+But not to tell of good or evil luck,
+Of plagues, of dearths, or seasons' quality;
+Nor can I fortune to brief minutes tell,
+Pointing to each his thunder, rain and wind,
+Or say with princes if it shall go well,
+By oft predict that I in heaven find:
+But from thine eyes my knowledge I derive,
+And, constant stars, in them I read such art
+As truth and beauty shall together thrive,
+If from thyself to store thou wouldst convert;
+Or else of thee this I prognosticate:
+Thy end is truth's and beauty's doom and date.
+When I consider every thing that grows
+Holds in perfection but a little moment,
+That this huge stage presenteth nought but shows
+Whereon the stars in secret influence comment;
+When I perceive that men as plants increase,
+Cheered and cheque'd even by the self-same sky,
+Vaunt in their youthful sap, at height decrease,
+And wear their brave state out of memory;
+Then the conceit of this inconstant stay
+Sets you most rich in youth before my sight,
+Where wasteful Time debateth with Decay,
+To change your day of youth to sullied night;
+And all in war with Time for love of you,
+As he takes from you, I engraft you new.
+But wherefore do not you a mightier way
+Make war upon this bloody tyrant, Time?
+And fortify yourself in your decay
+With means more blessed than my barren rhyme?
+Now stand you on the top of happy hours,
+And many maiden gardens yet unset
+With virtuous wish would bear your living flowers,
+Much liker than your painted counterfeit:
+So should the lines of life that life repair,
+Which this, Time's pencil, or my pupil pen,
+Neither in inward worth nor outward fair,
+Can make you live yourself in eyes of men.
+To give away yourself keeps yourself still,
+And you must live, drawn by your own sweet skill.
+Who will believe my verse in time to come,
+If it were fill'd with your most high deserts?
+Though yet, heaven knows, it is but as a tomb
+Which hides your life and shows not half your parts.
+If I could write the beauty of your eyes
+And in fresh numbers number all your graces,
+The age to come would say 'This poet lies:
+Such heavenly touches ne'er touch'd earthly faces.'
+So should my papers yellow'd with their age
+Be scorn'd like old men of less truth than tongue,
+And your true rights be term'd a poet's rage
+And stretched metre of an antique song:
+But were some child of yours alive that time,
+You should live twice; in it and in my rhyme.
+Shall I compare thee to a summer's day?
+Thou art more lovely and more temperate:
+Rough winds do shake the darling buds of May,
+And summer's lease hath all too short a date:
+Sometime too hot the eye of heaven shines,
+And often is his gold complexion dimm'd;
+And every fair from fair sometime declines,
+By chance or nature's changing course untrimm'd;
+But thy eternal summer shall not fade
+Nor lose possession of that fair thou owest;
+Nor shall Death brag thou wander'st in his shade,
+When in eternal lines to time thou growest:
+So long as men can breathe or eyes can see,
+So long lives this and this gives life to thee.
+Devouring Time, blunt thou the lion's paws,
+And make the earth devour her own sweet brood;
+Pluck the keen teeth from the fierce tiger's jaws,
+And burn the long-lived phoenix in her blood;
+Make glad and sorry seasons as thou fleets,
+And do whate'er thou wilt, swift-footed Time,
+To the wide world and all her fading sweets;
+But I forbid thee one most heinous crime:
+O, carve not with thy hours my love's fair brow,
+Nor draw no lines there with thine antique pen;
+Him in thy course untainted do allow
+For beauty's pattern to succeeding men.
+Yet, do thy worst, old Time: despite thy wrong,
+My love shall in my verse ever live young.
+A woman's face with Nature's own hand painted
+Hast thou, the master-mistress of my passion;
+A woman's gentle heart, but not acquainted
+With shifting change, as is false women's fashion;
+An eye more bright than theirs, less false in rolling,
+Gilding the object whereupon it gazeth;
+A man in hue, all 'hues' in his controlling,
+Much steals men's eyes and women's souls amazeth.
+And for a woman wert thou first created;
+Till Nature, as she wrought thee, fell a-doting,
+And by addition me of thee defeated,
+By adding one thing to my purpose nothing.
+But since she prick'd thee out for women's pleasure,
+Mine be thy love and thy love's use their treasure.
+So is it not with me as with that Muse
+Stirr'd by a painted beauty to his verse,
+Who heaven itself for ornament doth use
+And every fair with his fair doth rehearse
+Making a couplement of proud compare,
+With sun and moon, with earth and sea's rich gems,
+With April's first-born flowers, and all things rare
+That heaven's air in this huge rondure hems.
+O' let me, true in love, but truly write,
+And then believe me, my love is as fair
+As any mother's child, though not so bright
+As those gold candles fix'd in heaven's air:
+Let them say more than like of hearsay well;
+I will not praise that purpose not to sell.
+My glass shall not persuade me I am old,
+So long as youth and thou are of one date;
+But when in thee time's furrows I behold,
+Then look I death my days should expiate.
+For all that beauty that doth cover thee
+Is but the seemly raiment of my heart,
+Which in thy breast doth live, as thine in me:
+How can I then be elder than thou art?
+O, therefore, love, be of thyself so wary
+As I, not for myself, but for thee will;
+Bearing thy heart, which I will keep so chary
+As tender nurse her babe from faring ill.
+Presume not on thy heart when mine is slain;
+Thou gavest me thine, not to give back again.
+As an unperfect actor on the stage
+Who with his fear is put besides his part,
+Or some fierce thing replete with too much rage,
+Whose strength's abundance weakens his own heart.
+So I, for fear of trust, forget to say
+The perfect ceremony of love's rite,
+And in mine own love's strength seem to decay,
+O'ercharged with burden of mine own love's might.
+O, let my books be then the eloquence
+And dumb presagers of my speaking breast,
+Who plead for love and look for recompense
+More than that tongue that more hath more express'd.
+O, learn to read what silent love hath writ:
+To hear with eyes belongs to love's fine wit.
+Mine eye hath play'd the painter and hath stell'd
+Thy beauty's form in table of my heart;
+My body is the frame wherein 'tis held,
+And perspective it is the painter's art.
+For through the painter must you see his skill,
+To find where your true image pictured lies;
+Which in my bosom's shop is hanging still,
+That hath his windows glazed with thine eyes.
+Now see what good turns eyes for eyes have done:
+Mine eyes have drawn thy shape, and thine for me
+Are windows to my breast, where-through the sun
+Delights to peep, to gaze therein on thee;
+Yet eyes this cunning want to grace their art;
+They draw but what they see, know not the heart.
+Let those who are in favour with their stars
+Of public honour and proud titles boast,
+Whilst I, whom fortune of such triumph bars,
+Unlook'd for joy in that I honour most.
+Great princes' favourites their fair leaves spread
+But as the marigold at the sun's eye,
+And in themselves their pride lies buried,
+For at a frown they in their glory die.
+The painful warrior famoused for fight,
+After a thousand victories once foil'd,
+Is from the book of honour razed quite,
+And all the rest forgot for which he toil'd:
+Then happy I, that love and am beloved
+Where I may not remove nor be removed.
+Lord of my love, to whom in vassalage
+Thy merit hath my duty strongly knit,
+To thee I send this written embassage,
+To witness duty, not to show my wit:
+Duty so great, which wit so poor as mine
+May make seem bare, in wanting words to show it,
+But that I hope some good conceit of thine
+In thy soul's thought, all naked, will bestow it;
+Till whatsoever star that guides my moving
+Points on me graciously with fair aspect
+And puts apparel on my tatter'd loving,
+To show me worthy of thy sweet respect:
+Then may I dare to boast how I do love thee;
+Till then not show my head where thou mayst prove me.
+Weary with toil, I haste me to my bed,
+The dear repose for limbs with travel tired;
+But then begins a journey in my head,
+To work my mind, when body's work's expired:
+For then my thoughts, from far where I abide,
+Intend a zealous pilgrimage to thee,
+And keep my drooping eyelids open wide,
+Looking on darkness which the blind do see
+Save that my soul's imaginary sight
+Presents thy shadow to my sightless view,
+Which, like a jewel hung in ghastly night,
+Makes black night beauteous and her old face new.
+Lo! thus, by day my limbs, by night my mind,
+For thee and for myself no quiet find.
+How can I then return in happy plight,
+That am debarr'd the benefit of rest?
+When day's oppression is not eased by night,
+But day by night, and night by day, oppress'd?
+And each, though enemies to either's reign,
+Do in consent shake hands to torture me;
+The one by toil, the other to complain
+How far I toil, still farther off from thee.
+I tell the day, to please them thou art bright
+And dost him grace when clouds do blot the heaven:
+So flatter I the swart-complexion'd night,
+When sparkling stars twire not thou gild'st the even.
+But day doth daily draw my sorrows longer
+And night doth nightly make grief's strength seem stronger.
+When, in disgrace with fortune and men's eyes,
+I all alone beweep my outcast state
+And trouble deal heaven with my bootless cries
+And look upon myself and curse my fate,
+Wishing me like to one more rich in hope,
+Featured like him, like him with friends possess'd,
+Desiring this man's art and that man's scope,
+With what I most enjoy contented least;
+Yet in these thoughts myself almost despising,
+Haply I think on thee, and then my state,
+Like to the lark at break of day arising
+From sullen earth, sings hymns at heaven's gate;
+For thy sweet love remember'd such wealth brings
+That then I scorn to change my state with kings.
+When to the sessions of sweet silent thought
+I summon up remembrance of things past,
+I sigh the lack of many a thing I sought,
+And with old woes new wail my dear time's waste:
+Then can I drown an eye, unused to flow,
+For precious friends hid in death's dateless night,
+And weep afresh love's long since cancell'd woe,
+And moan the expense of many a vanish'd sight:
+Then can I grieve at grievances foregone,
+And heavily from woe to woe tell o'er
+The sad account of fore-bemoaned moan,
+Which I new pay as if not paid before.
+But if the while I think on thee, dear friend,
+All losses are restored and sorrows end.
+Thy bosom is endeared with all hearts,
+Which I by lacking have supposed dead,
+And there reigns love and all love's loving parts,
+And all those friends which I thought buried.
+How many a holy and obsequious tear
+Hath dear religious love stol'n from mine eye
+As interest of the dead, which now appear
+But things removed that hidden in thee lie!
+Thou art the grave where buried love doth live,
+Hung with the trophies of my lovers gone,
+Who all their parts of me to thee did give;
+That due of many now is thine alone:
+Their images I loved I view in thee,
+And thou, all they, hast all the all of me.
+If thou survive my well-contented day,
+When that churl Death my bones with dust shall cover,
+And shalt by fortune once more re-survey
+These poor rude lines of thy deceased lover,
+Compare them with the bettering of the time,
+And though they be outstripp'd by every pen,
+Reserve them for my love, not for their rhyme,
+Exceeded by the height of happier men.
+O, then vouchsafe me but this loving thought:
+'Had my friend's Muse grown with this growing age,
+A dearer birth than this his love had brought,
+To march in ranks of better equipage:
+But since he died and poets better prove,
+Theirs for their style I'll read, his for his love.'
+Full many a glorious morning have I seen
+Flatter the mountain-tops with sovereign eye,
+Kissing with golden face the meadows green,
+Gilding pale streams with heavenly alchemy;
+Anon permit the basest clouds to ride
+With ugly rack on his celestial face,
+And from the forlorn world his visage hide,
+Stealing unseen to west with this disgrace:
+Even so my sun one early morn did shine
+With all triumphant splendor on my brow;
+But out, alack! he was but one hour mine;
+The region cloud hath mask'd him from me now.
+Yet him for this my love no whit disdaineth;
+Suns of the world may stain when heaven's sun staineth.
+Why didst thou promise such a beauteous day,
+And make me travel forth without my cloak,
+To let base clouds o'ertake me in my way,
+Hiding thy bravery in their rotten smoke?
+'Tis not enough that through the cloud thou break,
+To dry the rain on my storm-beaten face,
+For no man well of such a salve can speak
+That heals the wound and cures not the disgrace:
+Nor can thy shame give physic to my grief;
+Though thou repent, yet I have still the loss:
+The offender's sorrow lends but weak relief
+To him that bears the strong offence's cross.
+Ah! but those tears are pearl which thy love sheds,
+And they are rich and ransom all ill deeds.
+No more be grieved at that which thou hast done:
+Roses have thorns, and silver fountains mud;
+Clouds and eclipses stain both moon and sun,
+And loathsome canker lives in sweetest bud.
+All men make faults, and even I in this,
+Authorizing thy trespass with compare,
+Myself corrupting, salving thy amiss,
+Excusing thy sins more than thy sins are;
+For to thy sensual fault I bring in sense--
+Thy adverse party is thy advocate--
+And 'gainst myself a lawful plea commence:
+Such civil war is in my love and hate
+That I an accessary needs must be
+To that sweet thief which sourly robs from me.
+Let me confess that we two must be twain,
+Although our undivided loves are one:
+So shall those blots that do with me remain
+Without thy help by me be borne alone.
+In our two loves there is but one respect,
+Though in our lives a separable spite,
+Which though it alter not love's sole effect,
+Yet doth it steal sweet hours from love's delight.
+I may not evermore acknowledge thee,
+Lest my bewailed guilt should do thee shame,
+Nor thou with public kindness honour me,
+Unless thou take that honour from thy name:
+But do not so; I love thee in such sort
+As, thou being mine, mine is thy good report.
+As a decrepit father takes delight
+To see his active child do deeds of youth,
+So I, made lame by fortune's dearest spite,
+Take all my comfort of thy worth and truth.
+For whether beauty, birth, or wealth, or wit,
+Or any of these all, or all, or more,
+Entitled in thy parts do crowned sit,
+I make my love engrafted to this store:
+So then I am not lame, poor, nor despised,
+Whilst that this shadow doth such substance give
+That I in thy abundance am sufficed
+And by a part of all thy glory live.
+Look, what is best, that best I wish in thee:
+This wish I have; then ten times happy me!