add pkgs

2025-08-06 15:49:14 +08:00
parent e80b916c52
commit bf00e72fb2
111 changed files with 21880 additions and 1 deletions
--- a/examples/vllm_test/benchmark_throughput.py
+++ b/examples/vllm_test/benchmark_throughput.py
@@ -0,0 +1,363 @@
+"""Benchmark offline inference throughput."""
+import argparse
+import json
+import random
+import time
+import random
+from typing import List, Tuple, Union
+
+import torch
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase
+
+from xtrt_llm.vllm import LLM, SamplingParams
+from xtrt_llm.vllm.transformers_utils.tokenizer import get_tokenizer
+
+
+def dummy_sample_requests(
+    tokenizer: PreTrainedTokenizerBase,
+    prompt: Union[str, List[str]],
+    tokenid: int,
+    output_len: Union[int, List[int]],
+    input_len: Union[int, List[int]],
+    max_model_len: int,
+    num_requests: Union[int, List[int]],
+) -> List[Tuple[List[int], int, int]]:
+    """Build synthetic requests with fixed prompt and output lengths.
+
+    Prompt token ids come from tokenizing ``prompt`` or, when ``tokenid`` is
+    given, from repeating that single token id (``tokenid`` takes precedence
+    when both are set). Each prompt is padded with its own first token, or
+    truncated, so that it is exactly ``input_len`` tokens long.
+
+    Args:
+        tokenizer: Callable tokenizer; only used when ``prompt`` is given.
+        prompt: One prompt string or a list of prompt strings, or None.
+        tokenid: Token id used to fill dummy prompts, or None.
+        output_len: Requested output length(s); scalar or one per prompt.
+        input_len: Prompt length(s) in tokens; scalar or one per prompt.
+        max_model_len: Output length is capped at
+            ``max_model_len - input_len``.
+        num_requests: Number of copies to emit per prompt; scalar or list.
+
+    Returns:
+        A shuffled list of ``(prompt_token_ids, input_len, output_len)``
+        tuples. Every tuple owns its own token list.
+
+    Raises:
+        ValueError: If neither ``prompt`` nor ``tokenid`` is given, or if a
+            prompt that needs padding tokenizes to nothing.
+    """
+    if prompt is None and tokenid is None:
+        raise ValueError("Either prompt or tokenid must be provided.")
+
+    if prompt is not None:
+        if isinstance(prompt, str):
+            assert isinstance(input_len, int) \
+                and isinstance(output_len, int) and isinstance(num_requests, int)
+            prompt_token_ids_list = [tokenizer(prompt).input_ids]
+            input_len = [input_len]
+            output_len = [output_len]
+            num_requests = [num_requests]
+        else:
+            assert isinstance(input_len, list) \
+                and isinstance(output_len, list) and isinstance(num_requests, list)
+            prompt_token_ids_list = [tokenizer(x).input_ids for x in prompt]
+    if tokenid is not None:
+        if isinstance(input_len, int):
+            assert isinstance(output_len, int) and isinstance(num_requests, int)
+            prompt_token_ids_list = [[tokenid] * input_len]
+            input_len = [input_len]
+            output_len = [output_len]
+            num_requests = [num_requests]
+        else:
+            assert isinstance(output_len, list) and isinstance(num_requests, list)
+            prompt_token_ids_list = [[tokenid] * x for x in input_len]
+
+    sampled_requests: List[Tuple[List[int], int, int]] = []
+    for i, token_ids in enumerate(prompt_token_ids_list):
+        target_len = input_len[i]
+        # Fit the prompt to the target length once per prompt. New lists are
+        # built instead of mutating the tokenizer output in place.
+        if len(token_ids) < target_len:
+            if not token_ids:
+                raise ValueError(
+                    "Cannot pad an empty prompt: no token ids to repeat.")
+            fitted = token_ids + [token_ids[0]] * (target_len - len(token_ids))
+        else:
+            fitted = token_ids[:target_len]
+        capped_output_len = min(output_len[i], max_model_len - target_len)
+        for _ in range(num_requests[i]):
+            # Copy per request so that requests never share a token list.
+            sampled_requests.append(
+                (list(fitted), target_len, capped_output_len))
+
+    random.shuffle(sampled_requests)
+    return sampled_requests
+
+
+def sample_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, int, int]]:
+    """Sample ``num_requests`` prompts from a JSON conversation dataset.
+
+    Only the first two turns of each conversation are used: the first as the
+    prompt and the second as the completion, whose token count becomes the
+    requested output length.
+
+    Returns:
+        A list of ``(prompt, prompt_len, output_len)`` tuples.
+    """
+    with open(dataset_path) as f:
+        records = json.load(f)
+
+    # Keep the first two turns of every conversation that has at least two.
+    pairs = [(record["conversations"][0]["value"],
+              record["conversations"][1]["value"])
+             for record in records
+             if len(record["conversations"]) >= 2]
+
+    # Tokenize prompts and completions in two batched calls.
+    prompts = [first_turn for first_turn, _ in pairs]
+    prompt_ids = tokenizer(prompts).input_ids
+    completions = [second_turn for _, second_turn in pairs]
+    completion_ids = tokenizer(completions).input_ids
+    output_lens = [len(completion_ids[idx]) for idx in range(len(pairs))]
+
+    eligible: List[Tuple[str, int, int]] = []
+    for idx, text in enumerate(prompts):
+        prompt_len = len(prompt_ids[idx])
+        output_len = output_lens[idx]
+        # Drop sequences that are too short to be meaningful.
+        if prompt_len < 4 or output_len < 4:
+            continue
+        # Drop sequences that are too long.
+        if prompt_len > 1024 or prompt_len + output_len > 2048:
+            continue
+        eligible.append((text, prompt_len, output_len))
+
+    return random.sample(eligible, num_requests)
+
+
+def dummy_run_vllm(
+    requests: List[Tuple[List[int], int, int]],
+    model: str,
+    tokenizer: str,
+    tensor_parallel_size: int,
+    seed: int,
+    n: int,
+    use_beam_search: bool,
+    trust_remote_code: bool,
+    max_model_len: int,
+    engine_dir: str,
+    max_num_seqs: int,
+    max_num_batched_tokens: int,
+) -> float:
+    """Run pre-tokenized dummy requests through the engine and time them.
+
+    Args:
+        requests: ``(prompt_token_ids, input_len, output_len)`` tuples; only
+            the token ids and ``output_len`` are used here.
+        model, tokenizer, tensor_parallel_size, seed, trust_remote_code,
+        max_model_len, engine_dir, max_num_seqs, max_num_batched_tokens:
+            Forwarded unchanged to ``LLM``.
+        n: Number of sequences generated per prompt.
+        use_beam_search: Use beam search (temperature 0.0) instead of
+            sampling (temperature 1.0).
+
+    Returns:
+        Elapsed seconds, covering both request submission and generation
+        (the timer starts before requests are added).
+    """
+    llm = LLM(
+        model=model,
+        tokenizer=tokenizer,
+        tensor_parallel_size=tensor_parallel_size,
+        seed=seed,
+        trust_remote_code=trust_remote_code,
+        disable_log_stats=False,
+        max_model_len=max_model_len,
+        engine_dir=engine_dir,
+        max_num_seqs=max_num_seqs,
+        max_num_batched_tokens=max_num_batched_tokens,
+    )
+    # perf_counter is monotonic, so the measured interval cannot be skewed
+    # by system clock adjustments during a long run.
+    start = time.perf_counter()
+    # Add the requests to the engine.
+    for prompt_tokenids, _, output_len in requests:
+        sampling_params = SamplingParams(
+            n=n,
+            temperature=0.0 if use_beam_search else 1.0,
+            top_p=1.0,
+            use_beam_search=use_beam_search,
+            ignore_eos=True,
+            max_tokens=output_len,
+        )
+        # FIXME(woosuk): Do not use internal method.
+        llm._add_request(
+            # model_type="llama2",
+            prompt=None,
+            prompt_token_ids=prompt_tokenids,
+            sampling_params=sampling_params,
+        )
+
+    # FIXME(woosuk): Do not use internal method.
+    llm._run_engine(use_tqdm=True)
+    end = time.perf_counter()
+    return end - start
+
+
+def run_vllm(
+    requests: List[Tuple[str, int, int]],
+    model: str,
+    tokenizer: str,
+    tensor_parallel_size: int,
+    seed: int,
+    n: int,
+    use_beam_search: bool,
+    trust_remote_code: bool,
+) -> float:
+    """Run text-prompt requests through the engine and time generation.
+
+    Args:
+        requests: ``(prompt, prompt_len, output_len)`` tuples; only the
+            prompt text and ``output_len`` are used here.
+        model, tokenizer, tensor_parallel_size, seed, trust_remote_code:
+            Forwarded unchanged to ``LLM``.
+        n: Number of sequences generated per prompt.
+        use_beam_search: Use beam search (temperature 0.0) instead of
+            sampling (temperature 1.0).
+
+    Returns:
+        Elapsed seconds spent in the engine run; request submission is not
+        included because the timer starts after all requests are added.
+    """
+    llm = LLM(
+        model=model,
+        tokenizer=tokenizer,
+        tensor_parallel_size=tensor_parallel_size,
+        seed=seed,
+        trust_remote_code=trust_remote_code,
+    )
+
+    # Add the requests to the engine.
+    for prompt, _, output_len in requests:
+        sampling_params = SamplingParams(
+            n=n,
+            temperature=0.0 if use_beam_search else 1.0,
+            top_p=1.0,
+            use_beam_search=use_beam_search,
+            ignore_eos=True,
+            max_tokens=output_len,
+        )
+        # FIXME(woosuk): Do not use internal method.
+        llm._add_request(
+            model_type="llama2",
+            prompt=prompt,
+            prompt_token_ids=None,
+            sampling_params=sampling_params,
+        )
+
+    # perf_counter is monotonic, so the measured interval cannot be skewed
+    # by system clock adjustments during a long run.
+    start = time.perf_counter()
+    # FIXME(woosuk): Do not use internal method.
+    llm._run_engine(use_tqdm=True)
+    end = time.perf_counter()
+    return end - start
+
+
+def run_hf(
+    requests: List[Tuple[str, int, int]],
+    model: str,
+    tokenizer: PreTrainedTokenizerBase,
+    n: int,
+    use_beam_search: bool,
+    max_batch_size: int,
+    trust_remote_code: bool,
+) -> float:
+    """Run the requests through a HuggingFace model in greedy-packed batches.
+
+    Consecutive requests are batched until either ``max_batch_size`` is
+    reached or adding the next request would push the batch's
+    ``max prompt length + max output length`` above 2048.
+
+    Args:
+        requests: ``(prompt, prompt_len, output_len)`` tuples.
+        model: Name or path passed to ``AutoModelForCausalLM.from_pretrained``.
+        tokenizer: Tokenizer used to encode batches and decode outputs.
+        n: Number of sequences returned per prompt.
+        use_beam_search: Must be False; beam search is not supported here.
+        max_batch_size: Upper bound on prompts per ``generate`` call.
+        trust_remote_code: Forwarded to ``from_pretrained``.
+
+    Returns:
+        Elapsed seconds for tokenization, generation and decoding.
+    """
+    assert not use_beam_search
+    llm = AutoModelForCausalLM.from_pretrained(
+        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
+    if llm.config.model_type == "llama":
+        # To enable padding in the HF backend.
+        tokenizer.pad_token = tokenizer.eos_token
+    llm = llm.cuda()
+
+    last_index = len(requests) - 1
+    # The context manager closes the progress bar even if generation fails.
+    with tqdm(total=len(requests)) as pbar:
+        # perf_counter is monotonic, so the measured interval cannot be
+        # skewed by system clock adjustments during a long run.
+        start = time.perf_counter()
+        batch: List[str] = []
+        max_prompt_len = 0
+        max_output_len = 0
+        for i, (prompt, prompt_len, output_len) in enumerate(requests):
+            # Add the prompt to the batch.
+            batch.append(prompt)
+            max_prompt_len = max(max_prompt_len, prompt_len)
+            max_output_len = max(max_output_len, output_len)
+            if len(batch) < max_batch_size and i != last_index:
+                # Check if we can add more requests to the batch.
+                _, next_prompt_len, next_output_len = requests[i + 1]
+                if (max(max_prompt_len, next_prompt_len) +
+                        max(max_output_len, next_output_len)) <= 2048:
+                    # We can add more requests to the batch.
+                    continue
+
+            # Generate the sequences.
+            input_ids = tokenizer(batch, return_tensors="pt",
+                                  padding=True).input_ids
+            llm_outputs = llm.generate(
+                input_ids=input_ids.cuda(),
+                do_sample=not use_beam_search,
+                num_return_sequences=n,
+                temperature=1.0,
+                top_p=1.0,
+                use_cache=True,
+                max_new_tokens=max_output_len,
+            )
+            # Include the decoding time.
+            tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
+            pbar.update(len(batch))
+
+            # Clear the batch.
+            batch = []
+            max_prompt_len = 0
+            max_output_len = 0
+        end = time.perf_counter()
+    return end - start
+
+
+def main(args: argparse.Namespace):
+    """Sample requests, run the selected backend and print the throughput.
+
+    With ``args.dummy_dataset`` set, synthetic requests are used and only
+    output tokens are counted; otherwise requests are sampled from
+    ``args.dataset`` and prompt plus output tokens are counted.
+    """
+    print(args)
+    random.seed(args.seed)
+
+    # The tokenizer object is needed for sampling in both modes.
+    tokenizer = get_tokenizer(args.tokenizer,
+                              trust_remote_code=args.trust_remote_code)
+    backend = args.backend
+
+    if args.dummy_dataset:
+        requests = dummy_sample_requests(tokenizer, args.dummy_prompt,
+                                         args.dummy_tokenid,
+                                         args.dummy_output_len,
+                                         args.dummy_input_len,
+                                         args.max_model_len, args.num_prompts)
+        # Only the vLLM backend supports the dummy dataset.
+        if backend != "vllm":
+            raise ValueError(f"Unknown backend: {args.backend}")
+        elapsed_time = dummy_run_vllm(
+            requests, args.model, args.tokenizer, args.tensor_parallel_size,
+            args.seed, args.n, args.use_beam_search, args.trust_remote_code,
+            args.max_model_len, args.engine_dir, args.max_num_seqs,
+            args.max_num_batched_tokens)
+        total_num_tokens = sum(request[2] for request in requests)
+    else:
+        requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
+        if backend == "vllm":
+            elapsed_time = run_vllm(requests, args.model, args.tokenizer,
+                                    args.tensor_parallel_size, args.seed,
+                                    args.n, args.use_beam_search,
+                                    args.trust_remote_code)
+        elif backend == "hf":
+            assert args.tensor_parallel_size == 1
+            elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
+                                  args.use_beam_search, args.hf_max_batch_size,
+                                  args.trust_remote_code)
+        else:
+            raise ValueError(f"Unknown backend: {args.backend}")
+        total_num_tokens = sum(request[1] + request[2]
+                               for request in requests)
+
+    requests_per_second = len(requests) / elapsed_time
+    tokens_per_second = total_num_tokens / elapsed_time
+    print(f"Throughput: {requests_per_second:.2f} requests/s, "
+          f"{tokens_per_second:.2f} tokens/s")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse options, validate them, run main().
+    parser = argparse.ArgumentParser(description="Benchmark the throughput.")
+    parser.add_argument("--backend",
+                        type=str,
+                        choices=["vllm", "hf"],
+                        default="vllm")
+    parser.add_argument("--dataset", type=str, help="Path to the dataset.")
+    parser.add_argument("--model", type=str, default="facebook/opt-125m")
+    parser.add_argument("--tokenizer", type=str, default=None)
+    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
+    parser.add_argument("--n",
+                        type=int,
+                        default=1,
+                        help="Number of generated sequences per prompt.")
+    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument("--num-prompts",
+                        nargs='+',
+                        type=int,
+                        default=1000,
+                        help="Number of prompts to process.")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--hf-max-batch-size",
+                        type=int,
+                        default=None,
+                        help="Maximum batch size for HF backend.")
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
+                        help='trust remote code from huggingface')
+    parser.add_argument('--max-model-len', type=int, default=2048)
+    parser.add_argument('--max-num-batched-tokens', type=int, default=2048)
+    parser.add_argument('--max-num-seqs', type=int, default=128)
+    parser.add_argument('--dummy-dataset',
+                        action='store_true',
+                        help='use dummy data to test')
+    parser.add_argument('--dummy-prompt', nargs='+', type=str, default=None)
+    parser.add_argument('--dummy-tokenid', type=int, default=None)
+    parser.add_argument('--dummy-input-len', nargs='+', type=int, default=1024)
+    parser.add_argument('--dummy-output-len', nargs='+', type=int, default=1024)
+    parser.add_argument("--engine_dir", type=str, help="Path to the engine.")
+    args = parser.parse_args()
+
+    if args.backend == "vllm":
+        if args.hf_max_batch_size is not None:
+            raise ValueError("HF max batch size is only for HF backend.")
+    elif args.backend == "hf":
+        if args.hf_max_batch_size is None:
+            raise ValueError("HF max batch size is required for HF backend.")
+    if args.dummy_dataset:
+        if args.dummy_prompt is None and args.dummy_tokenid is None:
+            raise ValueError(
+                "dummy_dataset is True, thus dummy_prompt is not None or dummy_tokenid is not None."
+            )
+    if args.tokenizer is None:
+        args.tokenizer = args.model
+
+    # Options declared with nargs='+' are lists when given on the command
+    # line but keep their scalar defaults otherwise. Make them consistent so
+    # that explicit and default values can be mixed.
+    per_prompt_options = ("num_prompts", "dummy_input_len", "dummy_output_len")
+    if args.dummy_dataset:
+        list_values = [
+            value for value in (
+                args.dummy_prompt,
+                *(getattr(args, name) for name in per_prompt_options))
+            if isinstance(value, list)
+        ]
+        if list_values:
+            # Broadcast each scalar default to the longest explicit list so
+            # that every prompt has a matching length and request count.
+            width = max(len(value) for value in list_values)
+            for name in per_prompt_options:
+                value = getattr(args, name)
+                if not isinstance(value, list):
+                    setattr(args, name, [value] * width)
+    elif isinstance(args.num_prompts, list):
+        # Dataset sampling needs a single integer sample size.
+        if len(args.num_prompts) != 1:
+            raise ValueError(
+                "--num-prompts takes a single value unless --dummy-dataset "
+                "is set.")
+        args.num_prompts = args.num_prompts[0]
+
+    main(args)
--- a/examples/vllm_test/openai_chatcompletion_client.py
+++ b/examples/vllm_test/openai_chatcompletion_client.py
@@ -0,0 +1,37 @@
+from openai import OpenAI
+
+# Point the OpenAI client at a locally running vLLM API server instead of
+# the hosted OpenAI service; the server is sent a placeholder API key.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+# Passing api_key explicitly overrides the OPENAI_API_KEY environment lookup.
+client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
+
+# Use the first model the server reports.
+models = client.models.list()
+model = models.data[0].id
+
+# A short multi-turn conversation ending with a follow-up user question.
+conversation = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "Who won the world series in 2020?"},
+    {
+        "role": "assistant",
+        "content": "The Los Angeles Dodgers won the World Series in 2020.",
+    },
+    {"role": "user", "content": "Where was it played?"},
+]
+
+chat_completion = client.chat.completions.create(
+    messages=conversation,
+    model=model,
+)
+
+
+print("Chat completion results:")
+print(chat_completion)
--- a/examples/vllm_test/run_llama1-7b_throughput.sh
+++ b/examples/vllm_test/run_llama1-7b_throughput.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Fixed input/output throughput benchmark for llama7b on a single XPU.
+# Usage: bash vllm_test/run_llama1-7b_throughput.sh /path/to/llama7b_hf_model /path/to/llama7b_vls_engine
+# NOTE: benchmark_throughput.py is resolved relative to the current directory.
+if [ "$#" -lt 2 ]; then
+    echo "Usage: $0 <model_path> <engine_path>" >&2
+    exit 1
+fi
+model_path=$1
+engine_path=$2
+
+# Paths are quoted so that directories containing spaces are passed intact.
+XMLIR_D_XPU_L3_SIZE=0 python benchmark_throughput.py \
+    --trust-remote-code \
+    --backend vllm \
+    --model "$model_path" \
+    --tokenizer "$model_path" \
+    --engine_dir "$engine_path" \
+    --tensor-parallel-size 1 \
+    --dummy-dataset \
+    --max-num-seqs 14 \
+    --max-num-batched-tokens 2048 \
+    --dummy-tokenid 1 \
+    --dummy-input-len 1024 \
+    --dummy-output-len 1024 \
+    --max-model-len 2048 \
+    --num-prompts 14
--- a/examples/vllm_test/run_stats.sh
+++ b/examples/vllm_test/run_stats.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Keep only the statistics lines of the server log, then summarize them.
+grep 'Avg prompt throughput' server.log > server.log.valid
+python run_stats_server.py
--- a/examples/vllm_test/run_stats_server.py
+++ b/examples/vllm_test/run_stats_server.py
@@ -0,0 +1,90 @@
+import re
+import sys
+import numpy as np
+import pandas as pd
+
+# Path of the filtered log to summarize; replace with your own file path.
+file_path = "server.log.valid"
+
+# Patterns for the values reported on each statistics line.
+FIRST_TOKEN_RE = re.compile(r"Avg First Token times:([0-9.]+)")
+PROMPT_THROUGHPUT_RE = re.compile(r"Avg prompt throughput: ([0-9.]+)")
+GENERATION_THROUGHPUT_RE = re.compile(r"Avg generation throughput: ([0-9.]+)")
+RUNNING_RE = re.compile(r"Running: (\d+)")
+
+# Column order of the printed report.
+STAT_NAMES = ('avg', 'max', 'min', 'p10', 'p90', 'p99', 'num')
+
+
+def collect_metrics(lines):
+    """Extract the per-line metric values from an iterable of log lines.
+
+    Values whose magnitude is at most 1e-5 are treated as idle samples and
+    skipped. The running count is only recorded together with a non-zero
+    generation throughput found on the same line.
+
+    Returns:
+        A dict mapping metric name to its list of values, in report order.
+    """
+    first_token_times_values = []
+    prompt_throughput_values = []
+    generation_throughput_values = []
+    running_values = []
+
+    for line in lines:
+        match_first_token = FIRST_TOKEN_RE.search(line)
+        match_prompt_throughput = PROMPT_THROUGHPUT_RE.search(line)
+        match_generation_throughput = GENERATION_THROUGHPUT_RE.search(line)
+        match_running = RUNNING_RE.search(line)
+
+        # First-token latency.
+        if match_first_token:
+            first_token_times = float(match_first_token.group(1))
+            if abs(first_token_times) > 1e-5:
+                first_token_times_values.append(first_token_times)
+
+        # Prompt throughput.
+        if match_prompt_throughput:
+            prompt_throughput = float(match_prompt_throughput.group(1))
+            if abs(prompt_throughput) > 1e-5:
+                prompt_throughput_values.append(prompt_throughput)
+
+        # Generation throughput and the number of running requests.
+        if match_generation_throughput and match_running:
+            generation_throughput = float(match_generation_throughput.group(1))
+            running = int(match_running.group(1))
+            if abs(generation_throughput) > 1e-5:
+                generation_throughput_values.append(generation_throughput)
+                running_values.append(running)
+
+    return {
+        'first_token_times': first_token_times_values,
+        'prompt_throughput': prompt_throughput_values,
+        'generation_throughput': generation_throughput_values,
+        'running': running_values,
+    }
+
+
+def summarize(values):
+    """Return avg/max/min/p10/p90/p99/num for ``values`` (all 0 if empty)."""
+    if len(values) == 0:
+        return dict.fromkeys(STAT_NAMES, 0)
+    return {
+        'avg': np.mean(values),
+        'max': np.max(values),
+        'min': np.min(values),
+        'p10': np.percentile(values, 10),
+        'p90': np.percentile(values, 90),
+        'p99': np.percentile(values, 99),
+        'num': len(values),
+    }
+
+
+def build_report(metrics):
+    """Build a DataFrame with one row per metric and one column per stat."""
+    summaries = {name: summarize(values) for name, values in metrics.items()}
+    data = {
+        stat: [summaries[name][stat] for name in metrics]
+        for stat in STAT_NAMES
+    }
+    return pd.DataFrame(data, index=list(metrics))
+
+
+def main():
+    """Read the log file, compute the statistics and print the report."""
+    with open(file_path, 'r') as file:
+        metrics = collect_metrics(file)
+    df = build_report(metrics)
+    # Display the DataFrame
+    print(df)
+
+
+if __name__ == "__main__":
+    main()
+
--- a/examples/vllm_test/run_throughput.sh
+++ b/examples/vllm_test/run_throughput.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Fixed input/output throughput benchmark; output is written to server.log.
+# Usage: bash run_throughput.sh <model_path> <engine_path>
+if [ "$#" -lt 2 ]; then
+    echo "Usage: $0 <model_path> <engine_path>" >&2
+    exit 1
+fi
+
+model_path=$1
+engine_path=$2
+
+# Paths are quoted so that directories containing spaces are passed intact.
+XMLIR_D_XPU_L3_SIZE=0 python benchmark_throughput.py \
+    --backend vllm \
+    --model "$model_path" \
+    --tokenizer "$model_path" \
+    --engine_dir "$engine_path" \
+    --tensor-parallel-size 8 \
+    --dummy-dataset \
+    --max-num-seqs 128 \
+    --max-num-batched-tokens 2048 \
+    --dummy-tokenid 1 \
+    --dummy-input-len 1024 \
+    --dummy-output-len 1024 \
+    --max-model-len 2048 \
+    --num-prompts 128 > server.log
--- a/examples/vllm_test/template_alpaca.jinja
+++ b/examples/vllm_test/template_alpaca.jinja
@@ -0,0 +1,29 @@
+{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
+
+{% for message in messages %}
+{% if message['role'] == 'user' %}
+### Instruction:
+{{ message['content']|trim -}}
+{% if not loop.last %}
+
+
+{% endif %}
+{% elif message['role'] == 'assistant' %}
+### Response:
+{{ message['content']|trim -}}
+{% if not loop.last %}
+
+
+{% endif %}
+{% elif message['role'] == 'user_context' %}
+### Input:
+{{ message['content']|trim -}}
+{% if not loop.last %}
+
+
+{% endif %}
+{% endif %}
+{% endfor %}
+{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
+### Response:
+{% endif %}
--- a/examples/vllm_test/template_chatml.jinja
+++ b/examples/vllm_test/template_chatml.jinja
@@ -0,0 +1 @@
+{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
--- a/examples/vllm_test/template_inkbot.jinja
+++ b/examples/vllm_test/template_inkbot.jinja
@@ -0,0 +1,30 @@
+<#meta#>
+- Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }}
+- Task: {{ (messages|selectattr('role', 'equalto', 'meta-task_name')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }}
+<#system#>
+{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
+<#chat#>
+{% for message in messages %}
+{% if message['role'] == 'user' %}
+<#user#>
+{{ message['content']|trim -}}
+{% if not loop.last %}
+
+{% endif %}
+{% elif message['role'] == 'assistant' %}
+<#bot#>
+{{ message['content']|trim -}}
+{% if not loop.last %}
+
+{% endif %}
+{% elif message['role'] == 'user_context' %}
+<#user_context#>
+{{ message['content']|trim -}}
+{% if not loop.last %}
+
+{% endif %}
+{% endif %}
+{% endfor %}
+{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
+<#bot#>
+{% endif %}
--- a/examples/vllm_test/test_api_client.py
+++ b/examples/vllm_test/test_api_client.py
@@ -0,0 +1,102 @@
+"""Example Python client for vllm.entrypoints.api_server"""
+
+import argparse
+import json
+from typing import Iterable, List
+
+import requests
+from xtrt_llm.vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
+
+
+def clear_line(n: int = 1) -> None:
+    """Erase the last ``n`` terminal lines using ANSI escape sequences."""
+    cursor_up, erase_line = '\033[1A', '\x1b[2K'
+    for _step in range(n):
+        # Move the cursor up one line, then wipe that line.
+        print(cursor_up, end=erase_line, flush=True)
+
+
+def post_http_request(prompt: str,
+                      api_url: str,
+                      n: int = 1,
+                      stream: bool = False) -> requests.Response:
+    """POST a beam-search generation request for ``prompt`` to ``api_url``.
+
+    The ``stream`` flag is forwarded in the JSON payload; the HTTP response
+    itself is always opened in streaming mode.
+    """
+    payload = dict(
+        prompt=prompt,
+        n=n,
+        use_beam_search=True,
+        temperature=0.0,
+        max_tokens=16,
+        stream=stream,
+    )
+    return requests.post(api_url,
+                         headers={"User-Agent": "Test Client"},
+                         json=payload,
+                         stream=True)
+
+
+def get_streaming_response(response: requests.Response) -> Iterable[List[str]]:
+    """Yield the ``"text"`` field of each NUL-delimited JSON chunk."""
+    raw_chunks = response.iter_lines(chunk_size=8192,
+                                     decode_unicode=False,
+                                     delimiter=b"\0")
+    for raw in raw_chunks:
+        # Empty chunks carry no payload; skip them.
+        if not raw:
+            continue
+        yield json.loads(raw.decode("utf-8"))["text"]
+
+
+def get_response(response: requests.Response) -> List[str]:
+    """Return the ``"text"`` field of a non-streaming JSON response body."""
+    return json.loads(response.content)["text"]
+
+
+def create_test_prompts() -> List[str]:
+    """Create a list of test prompts."""
+    hamlet = "To be or not to be,"
+    robot = "A robot may not injure a human being"
+    heart = "It is only with the heart that one can see rightly"
+
+    # One cycle of nine prompts, repeated 100 times (900 prompts in total).
+    cycle = [
+        hamlet, robot, robot,
+        heart, robot, hamlet,
+        heart, hamlet, heart,
+    ]
+    return cycle * 100
+
+
+if __name__ == "__main__":
+    # Command-line client: send one prompt (or the built-in prompt list) to
+    # the /generate endpoint and print the returned beam candidates.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--n", type=int, default=4)
+    parser.add_argument("--prompt", type=str, default="")
+    parser.add_argument("--stream", action="store_true")
+    args = parser.parse_args()
+    api_url = f"http://{args.host}:{args.port}/generate"
+
+    # An empty --prompt selects the built-in test prompts.
+    prompts = [args.prompt] if args.prompt != '' else create_test_prompts()
+
+    for current_prompt in prompts:
+        print(f"Prompt: {current_prompt!r}\n", flush=True)
+        response = post_http_request(current_prompt, api_url, args.n,
+                                     args.stream)
+
+        if not args.stream:
+            for rank, text in enumerate(get_response(response)):
+                print(f"Beam candidate {rank}: {text!r}", flush=True)
+            continue
+
+        # Streaming: overwrite the previously printed candidates in place.
+        num_printed_lines = 0
+        for candidates in get_streaming_response(response):
+            clear_line(num_printed_lines)
+            num_printed_lines = 0
+            for rank, text in enumerate(candidates):
+                num_printed_lines += 1
+                print(f"Beam candidate {rank}: {text!r}", flush=True)
--- a/examples/vllm_test/test_llm_engine.py
+++ b/examples/vllm_test/test_llm_engine.py
@@ -0,0 +1,119 @@
+import argparse
+from typing import List, Tuple
+import xtrt_llm
+from xtrt_llm.vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
+
+
+def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
+    """Create a list of test prompts with their sampling parameters."""
+    robot = "A robot may not injure a human being"
+    hamlet = "To be or not to be,"
+    heart = "It is only with the heart that one can see rightly"
+
+    # Submission order of the twelve prompts.
+    ordered_prompts = [
+        robot, hamlet, hamlet, robot,
+        hamlet, robot, robot, heart,
+        robot, heart, robot, heart,
+    ]
+
+    # Every prompt gets its own SamplingParams instance with greedy decoding.
+    # Alternative configurations kept for reference:
+    # ("To be or not to be,",
+    #  SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
+    # ("What is the meaning of life?",
+    #  SamplingParams(n=2,
+    #                 best_of=5,
+    #                 temperature=0.8,
+    #                 top_p=0.95,
+    #                 frequency_penalty=0.1)),
+    # ("It is only with the heart that one can see rightly",
+    #  SamplingParams(n=3, best_of=3, use_beam_search=True,
+    #                 temperature=0.0)),
+    return [(text,
+             SamplingParams(temperature=0.0,
+                            logprobs=1,
+                            prompt_logprobs=1,
+                            max_tokens=50)) for text in ordered_prompts]
+
+
+def process_requests(engine: LLMEngine,
+                     test_prompts: List[Tuple[str, SamplingParams]]):
+    """Continuously process a list of prompts and handle the outputs.
+
+    One prompt is submitted per engine step until ``test_prompts`` is empty
+    (the list is consumed in place); stepping then continues until the
+    engine reports no unfinished requests. Finished outputs are printed.
+    """
+    next_id = 0
+
+    while True:
+        # Stop once nothing is left to submit and nothing is in flight.
+        if not test_prompts and not engine.has_unfinished_requests():
+            break
+
+        if test_prompts:
+            text, params = test_prompts.pop(0)
+            engine.add_request(str(next_id), text, params)
+            next_id += 1
+
+        step_outputs: List[RequestOutput] = engine.step()
+        for output in step_outputs:
+            if not output.finished:
+                continue
+            print("end_request_output:", output)
+
+
+def initialize_engine(args: argparse.Namespace) -> LLMEngine:
+    """Initialize the LLMEngine from the command line arguments."""
+    # Convert the parsed CLI namespace to EngineArgs, then build the engine.
+    return LLMEngine.from_engine_args(EngineArgs.from_cli_args(args))
+
+
+def main(args: argparse.Namespace):
+    """Main function that sets up and runs the prompt processing."""
+    # The engine is created first, then the prompt list is built.
+    process_requests(initialize_engine(args), create_test_prompts())
+
+
+if __name__ == '__main__':
+    # Register the engine's own CLI options on a fresh parser, then run.
+    cli = EngineArgs.add_cli_args(
+        argparse.ArgumentParser(
+            description='Demo on using the LLMEngine class directly'))
+    main(cli.parse_args())
				`@@ -0,0 +1 @@`
				`{% for message in messages %}{{'<\|im_start\|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<\|im_end\|>' + '\n'}}{% endif %}{% endfor %}`