add pkgs
This commit is contained in:
363
examples/vllm_test/benchmark_throughput.py
Normal file
363
examples/vllm_test/benchmark_throughput.py
Normal file
@@ -0,0 +1,363 @@
|
||||
"""Benchmark offline inference throughput."""
|
||||
import argparse
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
import random
|
||||
from typing import List, Tuple, Union
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase
|
||||
|
||||
from xtrt_llm.vllm import LLM, SamplingParams
|
||||
from xtrt_llm.vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
|
||||
def _as_list(value):
    """Return *value* as a new list; a scalar becomes a one-element list."""
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _match_groups(values, num_groups, name):
    """Broadcast a one-element list to *num_groups* entries, else check its length."""
    if len(values) == 1:
        return values * num_groups
    if len(values) < num_groups:
        raise ValueError(
            f"{name} has {len(values)} entries but {num_groups} are needed.")
    return values


def dummy_sample_requests(
    tokenizer: "PreTrainedTokenizerBase",
    prompt: Union[str, List[str]],
    tokenid: int,
    output_len: Union[int, List[int]],
    input_len: Union[int, List[int]],
    max_model_len: int,
    num_requests: Union[int, List[int]],
) -> List[Tuple[List[int], int, int]]:
    """Build synthetic requests with fixed prompt and output lengths.

    Each "group" is one base prompt. When ``tokenid`` is given, there is one
    group per entry of ``input_len`` and the prompt is that token repeated;
    ``tokenid`` takes precedence over ``prompt``. Otherwise each string in
    ``prompt`` is tokenized with ``tokenizer`` to form one group.

    ``output_len``, ``input_len`` and ``num_requests`` may each be a scalar or
    a list. Scalars and one-element lists are applied to every group, so the
    mix of int defaults and ``nargs='+'`` lists produced by the command line
    is accepted.

    Args:
        tokenizer: Callable whose result exposes ``input_ids``; only used when
            ``tokenid`` is None.
        prompt: One prompt string or a list of them, or None.
        tokenid: Token id used to fill the prompts, or None.
        output_len: Requested number of output tokens per group.
        input_len: Prompt length per group; prompts are padded with their
            first token or truncated to this length.
        max_model_len: Upper bound on prompt plus output tokens.
        num_requests: Number of requests to generate per group.

    Returns:
        A shuffled list of ``(prompt_token_ids, input_len, max_output_tokens)``
        where ``max_output_tokens`` is
        ``min(output_len, max_model_len - input_len)``.

    Raises:
        ValueError: If neither ``prompt`` nor ``tokenid`` is given, if a list
            argument is shorter than the number of groups, or if an empty
            prompt would have to be padded.
    """
    input_lens = _as_list(input_len)
    output_lens = _as_list(output_len)
    request_counts = _as_list(num_requests)

    if tokenid is not None:
        base_token_ids = [[tokenid] * length for length in input_lens]
    elif prompt is not None:
        prompts = [prompt] if isinstance(prompt, str) else list(prompt)
        # Copy so the tokenizer's own output is never mutated.
        base_token_ids = [list(tokenizer(text).input_ids) for text in prompts]
    else:
        raise ValueError("Either prompt or tokenid must be provided.")

    num_groups = len(base_token_ids)
    input_lens = _match_groups(input_lens, num_groups, "input_len")
    output_lens = _match_groups(output_lens, num_groups, "output_len")
    request_counts = _match_groups(request_counts, num_groups, "num_requests")

    sampled_requests: List[Tuple[List[int], int, int]] = []
    for token_ids, in_len, out_len, count in zip(base_token_ids, input_lens,
                                                 output_lens, request_counts):
        if len(token_ids) < in_len:
            if not token_ids:
                raise ValueError(
                    "Cannot pad an empty prompt to the requested input length.")
            # Pad by repeating the first token until the prompt is long enough.
            token_ids = token_ids + [token_ids[0]] * (in_len - len(token_ids))
        else:
            token_ids = token_ids[:in_len]

        max_tokens = min(out_len, max_model_len - in_len)
        for _ in range(count):
            # Every request gets its own list so later mutation of one request
            # cannot leak into the others.
            sampled_requests.append((list(token_ids), in_len, max_tokens))

    random.shuffle(sampled_requests)
    return sampled_requests
|
||||
|
||||
|
||||
def sample_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: "PreTrainedTokenizerBase",
) -> List[Tuple[str, int, int]]:
    """Sample benchmark requests from a JSON conversation dataset.

    The file must contain a list of objects, each with a ``"conversations"``
    list whose items have a ``"value"`` string. The first turn is used as the
    prompt and the second as the reference completion, whose token count
    becomes the requested output length.

    Args:
        dataset_path: Path to the JSON dataset.
        num_requests: Number of requests to draw without replacement.
        tokenizer: Callable that accepts a list of strings and returns an
            object exposing ``input_ids`` (one token list per string).

    Returns:
        A list of ``(prompt, prompt_len, output_len)`` tuples.

    Raises:
        ValueError: If fewer than ``num_requests`` conversations survive the
            length filters.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(dataset_path, encoding="utf-8") as f:
        dataset = json.load(f)

    # Keep the first two turns of every conversation that has at least two.
    turns = [
        (data["conversations"][0]["value"], data["conversations"][1]["value"])
        for data in dataset
        if len(data["conversations"]) >= 2
    ]
    prompts = [prompt for prompt, _ in turns]
    completions = [completion for _, completion in turns]

    # Tokenize in two batches; skip the calls entirely for an empty dataset.
    if turns:
        all_prompt_ids = tokenizer(prompts).input_ids
        all_completion_ids = tokenizer(completions).input_ids
    else:
        all_prompt_ids, all_completion_ids = [], []

    filtered_dataset: List[Tuple[str, int, int]] = []
    for prompt, prompt_ids, completion_ids in zip(prompts, all_prompt_ids,
                                                  all_completion_ids):
        prompt_len = len(prompt_ids)
        output_len = len(completion_ids)
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue
        filtered_dataset.append((prompt, prompt_len, output_len))

    if num_requests > len(filtered_dataset):
        raise ValueError(
            f"Requested {num_requests} prompts but only "
            f"{len(filtered_dataset)} usable conversations are available in "
            f"{dataset_path}.")

    return random.sample(filtered_dataset, num_requests)
|
||||
|
||||
|
||||
def dummy_run_vllm(
    requests: List[Tuple[List[int], int, int]],
    model: str,
    tokenizer: str,
    tensor_parallel_size: int,
    seed: int,
    n: int,
    use_beam_search: bool,
    trust_remote_code: bool,
    max_model_len: int,
    engine_dir: str,
    max_num_seqs: int,
    max_num_batched_tokens: int,
) -> float:
    """Time the engine on pre-tokenized requests.

    Each request is ``(prompt_token_ids, input_len, output_len)``; only the
    token ids and the output length are used. The timer starts before the
    requests are queued, so the returned value covers both queuing and
    generation.

    Returns:
        Elapsed wall-clock time in seconds.
    """
    engine_options = dict(
        model=model,
        tokenizer=tokenizer,
        tensor_parallel_size=tensor_parallel_size,
        seed=seed,
        trust_remote_code=trust_remote_code,
        disable_log_stats=False,
        max_model_len=max_model_len,
        engine_dir=engine_dir,
        max_num_seqs=max_num_seqs,
        max_num_batched_tokens=max_num_batched_tokens,
    )
    llm = LLM(**engine_options)

    # Beam search runs with temperature 0.0, plain sampling with 1.0.
    temperature = 0.0 if use_beam_search else 1.0

    started_at = time.time()
    for token_ids, _, max_new_tokens in requests:
        params = SamplingParams(
            n=n,
            temperature=temperature,
            top_p=1.0,
            use_beam_search=use_beam_search,
            ignore_eos=True,
            max_tokens=max_new_tokens,
        )
        # FIXME(woosuk): Do not use internal method.
        llm._add_request(
            prompt=None,
            prompt_token_ids=token_ids,
            sampling_params=params,
        )

    # FIXME(woosuk): Do not use internal method.
    llm._run_engine(use_tqdm=True)
    return time.time() - started_at
|
||||
|
||||
|
||||
def run_vllm(
    requests: List[Tuple[str, int, int]],
    model: str,
    tokenizer: str,
    tensor_parallel_size: int,
    seed: int,
    n: int,
    use_beam_search: bool,
    trust_remote_code: bool,
) -> float:
    """Time the engine on text-prompt requests.

    Each request is ``(prompt, prompt_len, output_len)``; only the prompt text
    and the output length are used. All requests are queued first and the
    timer only covers the engine run.

    Returns:
        Elapsed wall-clock time in seconds.
    """
    llm = LLM(model=model,
              tokenizer=tokenizer,
              tensor_parallel_size=tensor_parallel_size,
              seed=seed,
              trust_remote_code=trust_remote_code)

    # Beam search runs with temperature 0.0, plain sampling with 1.0.
    temperature = 0.0 if use_beam_search else 1.0

    for text, _, max_new_tokens in requests:
        params = SamplingParams(
            n=n,
            temperature=temperature,
            top_p=1.0,
            use_beam_search=use_beam_search,
            ignore_eos=True,
            max_tokens=max_new_tokens,
        )
        # FIXME(woosuk): Do not use internal method.
        llm._add_request(
            model_type="llama2",
            prompt=text,
            prompt_token_ids=None,
            sampling_params=params,
        )

    started_at = time.time()
    # FIXME(woosuk): Do not use internal method.
    llm._run_engine(use_tqdm=True)
    return time.time() - started_at
|
||||
|
||||
|
||||
def run_hf(
    requests: List[Tuple[str, int, int]],
    model: str,
    tokenizer: PreTrainedTokenizerBase,
    n: int,
    use_beam_search: bool,
    max_batch_size: int,
    trust_remote_code: bool,
) -> float:
    """Time generation with a Hugging Face causal LM loaded in float16 on CUDA.

    Requests are grouped greedily into batches of at most ``max_batch_size``
    prompts, as long as the longest prompt plus the longest output in the
    batch stays within 2048 tokens. Beam search is not supported here.

    Returns:
        Elapsed wall-clock time in seconds, including decoding of the outputs.
    """
    assert not use_beam_search
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
    if llm.config.model_type == "llama":
        # To enable padding in the HF backend.
        tokenizer.pad_token = tokenizer.eos_token
    llm = llm.cuda()

    pbar = tqdm(total=len(requests))
    start = time.time()

    pending: List[str] = []
    longest_prompt = 0
    longest_output = 0
    last_index = len(requests) - 1
    for index, (prompt, prompt_len, output_len) in enumerate(requests):
        pending.append(prompt)
        longest_prompt = max(longest_prompt, prompt_len)
        longest_output = max(longest_output, output_len)

        if len(pending) < max_batch_size and index != last_index:
            # Peek at the next request: keep accumulating if it still fits.
            _, next_prompt_len, next_output_len = requests[index + 1]
            padded_total = (max(longest_prompt, next_prompt_len) +
                            max(longest_output, next_output_len))
            if padded_total <= 2048:
                continue

        # The batch is full (or this is the last request): generate.
        encoded = tokenizer(pending, return_tensors="pt", padding=True)
        generated = llm.generate(
            input_ids=encoded.input_ids.cuda(),
            do_sample=not use_beam_search,
            num_return_sequences=n,
            temperature=1.0,
            top_p=1.0,
            use_cache=True,
            max_new_tokens=longest_output,
        )
        # Decode as well so that decoding time is part of the measurement.
        tokenizer.batch_decode(generated, skip_special_tokens=True)
        pbar.update(len(pending))

        # Start a fresh batch.
        pending = []
        longest_prompt = 0
        longest_output = 0

    return time.time() - start
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
    """Run the throughput benchmark described by *args* and print the result.

    With ``args.dummy_dataset`` set, synthetic requests are generated and only
    the ``vllm`` backend is supported; the reported tokens/s counts output
    tokens only. Otherwise requests are sampled from ``args.dataset`` and the
    reported tokens/s counts prompt plus output tokens.

    Raises:
        ValueError: For an unsupported backend, or when several values are
            passed to ``--num-prompts`` without ``--dummy-dataset``.
    """
    print(args)
    random.seed(args.seed)

    # Sample the requests.
    tokenizer = get_tokenizer(args.tokenizer,
                              trust_remote_code=args.trust_remote_code)
    if args.dummy_dataset:
        requests = dummy_sample_requests(tokenizer, args.dummy_prompt,
                                         args.dummy_tokenid,
                                         args.dummy_output_len,
                                         args.dummy_input_len,
                                         args.max_model_len, args.num_prompts)

        if args.backend == "vllm":
            elapsed_time = dummy_run_vllm(
                requests, args.model, args.tokenizer, args.tensor_parallel_size,
                args.seed, args.n, args.use_beam_search, args.trust_remote_code,
                args.max_model_len, args.engine_dir, args.max_num_seqs,
                args.max_num_batched_tokens)
        else:
            raise ValueError(f"Unknown backend: {args.backend}")
        total_num_tokens = sum(output_len
                               for _, _, output_len in requests)
        print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
              f"{total_num_tokens / elapsed_time:.2f} tokens/s")
    else:
        # --num-prompts is declared with nargs='+', so a value given on the
        # command line arrives as a list while the default is a plain int.
        # The dataset sampler needs a single int.
        num_prompts = args.num_prompts
        if isinstance(num_prompts, (list, tuple)):
            if len(num_prompts) != 1:
                raise ValueError(
                    "--num-prompts takes exactly one value unless "
                    "--dummy-dataset is set.")
            num_prompts = num_prompts[0]
        requests = sample_requests(args.dataset, num_prompts, tokenizer)

        if args.backend == "vllm":
            elapsed_time = run_vllm(requests, args.model, args.tokenizer,
                                    args.tensor_parallel_size, args.seed,
                                    args.n, args.use_beam_search,
                                    args.trust_remote_code)
        elif args.backend == "hf":
            assert args.tensor_parallel_size == 1
            elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
                                  args.use_beam_search, args.hf_max_batch_size,
                                  args.trust_remote_code)
        else:
            raise ValueError(f"Unknown backend: {args.backend}")
        total_num_tokens = sum(prompt_len + output_len
                               for _, prompt_len, output_len in requests)
        print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
              f"{total_num_tokens / elapsed_time:.2f} tokens/s")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse the options, validate combinations that
    # argparse cannot express, then run the benchmark.
    parser = argparse.ArgumentParser(description="Benchmark the throughput.")
    parser.add_argument("--backend",
                        type=str,
                        choices=["vllm", "hf"],
                        default="vllm")
    parser.add_argument("--dataset", type=str, help="Path to the dataset.")
    parser.add_argument("--model", type=str, default="facebook/opt-125m")
    parser.add_argument("--tokenizer", type=str, default=None)
    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
    parser.add_argument("--n",
                        type=int,
                        default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument("--use-beam-search", action="store_true")
    # NOTE: with nargs='+' a value given on the command line is parsed into a
    # list, while the default below stays a plain int.
    parser.add_argument("--num-prompts",
                        nargs='+',
                        type=int,
                        default=1000,
                        help="Number of prompts to process.")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--hf-max-batch-size",
                        type=int,
                        default=None,
                        help="Maximum batch size for HF backend.")
    parser.add_argument('--trust-remote-code',
                        action='store_true',
                        help='trust remote code from huggingface')
    parser.add_argument('--max-model-len', type=int, default=2048)
    parser.add_argument('--max-num-batched-tokens', type=int, default=2048)
    parser.add_argument('--max-num-seqs', type=int, default=128)
    parser.add_argument('--dummy-dataset',
                        action='store_true',
                        help='use dummy data to test')
    parser.add_argument('--dummy-prompt', nargs='+', type=str, default=None)
    parser.add_argument('--dummy-tokenid', type=int, default=None)
    parser.add_argument('--dummy-input-len', nargs='+', type=int, default=1024)
    parser.add_argument('--dummy-output-len', nargs='+', type=int, default=1024)
    parser.add_argument("--engine_dir", type=str, help="Path to the engine.")
    args = parser.parse_args()

    if args.backend == "vllm":
        if args.hf_max_batch_size is not None:
            raise ValueError("HF max batch size is only for HF backend.")
    elif args.backend == "hf":
        if args.hf_max_batch_size is None:
            raise ValueError("HF max batch size is required for HF backend.")
    if args.dummy_dataset:
        if args.dummy_prompt is None and args.dummy_tokenid is None:
            raise ValueError(
                "dummy_dataset is True, thus dummy_prompt is not None or dummy_tokenid is not None."
            )
    elif args.dataset is None:
        # Fail early with a clear message instead of a TypeError from
        # open(None) once the dataset is loaded.
        raise ValueError(
            "--dataset is required unless --dummy-dataset is set.")
    if args.tokenizer is None:
        # Fall back to the tokenizer that ships with the model.
        args.tokenizer = args.model

    main(args)
|
||||
37
examples/vllm_test/openai_chatcompletion_client.py
Normal file
37
examples/vllm_test/openai_chatcompletion_client.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from openai import OpenAI
|
||||
|
||||
# Point the OpenAI client at vLLM's API server instead of OpenAI itself:
# a placeholder key and a local base URL.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

# Without an explicit api_key the client would fall back to
# os.environ.get("OPENAI_API_KEY").
client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

# Use the first model reported by the server.
models = client.models.list()
model = models.data[0].id

# A short multi-turn conversation ending with a user question.
conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant."
    },
    {
        "role": "user",
        "content": "Who won the world series in 2020?"
    },
    {
        "role": "assistant",
        "content": "The Los Angeles Dodgers won the World Series in 2020."
    },
    {
        "role": "user",
        "content": "Where was it played?"
    },
]

chat_completion = client.chat.completions.create(
    messages=conversation,
    model=model,
)

print("Chat completion results:")
print(chat_completion)
|
||||
21
examples/vllm_test/run_llama1-7b_throughput.sh
Normal file
21
examples/vllm_test/run_llama1-7b_throughput.sh
Normal file
@@ -0,0 +1,21 @@
|
||||
#!/bin/bash
# Fixed input/output-length throughput benchmark, llama7b on 1 XPU.
#
# Usage:
#   bash vllm_test/run_llama1-7b_throughput.sh /path/to/llama7b_hf_model /path/to/llama7b_vls_engine

if [ "$#" -lt 2 ]; then
    echo "Usage: $0 /path/to/llama7b_hf_model /path/to/llama7b_vls_engine" >&2
    exit 1
fi

model_path=$1
engine_path=$2

# Paths are quoted so that directories containing spaces are passed intact.
XMLIR_D_XPU_L3_SIZE=0 python benchmark_throughput.py \
    --trust-remote-code \
    --backend vllm \
    --model "$model_path" \
    --tokenizer "$model_path" \
    --engine_dir "$engine_path" \
    --tensor-parallel-size 1 \
    --dummy-dataset \
    --max-num-seqs 14 \
    --max-num-batched-tokens 2048 \
    --dummy-tokenid 1 \
    --dummy-input-len 1024 \
    --dummy-output-len 1024 \
    --max-model-len 2048 \
    --num-prompts 14
|
||||
3
examples/vllm_test/run_stats.sh
Normal file
3
examples/vllm_test/run_stats.sh
Normal file
@@ -0,0 +1,3 @@
|
||||
#!/bin/bash
# Keep only the log lines that carry throughput statistics, then summarize
# them. The previous backtick assignment captured nothing because grep's
# output is redirected to the file.
grep 'Avg prompt throughput' server.log > server.log.valid
python run_stats_server.py
|
||||
90
examples/vllm_test/run_stats_server.py
Normal file
90
examples/vllm_test/run_stats_server.py
Normal file
@@ -0,0 +1,90 @@
|
||||
import re
|
||||
import sys
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# Samples collected for each metric.
first_token_times_values = []
prompt_throughput_values = []
generation_throughput_values = []
running_values = []

# Readings whose magnitude is at or below this threshold are skipped.
_MIN_MAGNITUDE = 1e-5

# Patterns for the values of interest; compiled once instead of per log line.
_FIRST_TOKEN_RE = re.compile(r"Avg First Token times:([0-9.]+)")
_PROMPT_THROUGHPUT_RE = re.compile(r"Avg prompt throughput: ([0-9.]+)")
_GENERATION_THROUGHPUT_RE = re.compile(r"Avg generation throughput: ([0-9.]+)")
_RUNNING_RE = re.compile(r"Running: (\d+)")

# Column order of the printed table.
_STAT_NAMES = ('avg', 'max', 'min', 'p10', 'p90', 'p99', 'num')


def _summarize(values):
    """Return avg/max/min/p10/p90/p99/num for *values*; all zeros when empty."""
    if len(values) == 0:
        return {name: 0 for name in _STAT_NAMES}
    return {
        'avg': np.mean(values),
        'max': np.max(values),
        'min': np.min(values),
        'p10': np.percentile(values, 10),
        'p90': np.percentile(values, 90),
        'p99': np.percentile(values, 99),
        'num': len(values),
    }


# Read the data from the file.
file_path = "server.log.valid"  # replace with your own file path
with open(file_path, 'r') as file:
    # Scan every line of the file and collect the samples.
    for line in file:
        match_first_token = _FIRST_TOKEN_RE.search(line)
        match_prompt_throughput = _PROMPT_THROUGHPUT_RE.search(line)
        match_generation_throughput = _GENERATION_THROUGHPUT_RE.search(line)
        match_running = _RUNNING_RE.search(line)

        # Avg First Token times
        if match_first_token:
            first_token_times = float(match_first_token.group(1))
            if abs(first_token_times) > _MIN_MAGNITUDE:
                first_token_times_values.append(first_token_times)

        # Avg prompt throughput
        if match_prompt_throughput:
            prompt_throughput = float(match_prompt_throughput.group(1))
            if abs(prompt_throughput) > _MIN_MAGNITUDE:
                prompt_throughput_values.append(prompt_throughput)

        # Avg generation throughput and Running are recorded together, and
        # only when the generation throughput is non-zero.
        if match_generation_throughput and match_running:
            generation_throughput = float(match_generation_throughput.group(1))
            running = int(match_running.group(1))
            if abs(generation_throughput) > _MIN_MAGNITUDE:
                generation_throughput_values.append(generation_throughput)
                running_values.append(running)

# Compute the statistics of each metric, in the row order of the table.
_summaries = [
    _summarize(first_token_times_values),
    _summarize(prompt_throughput_values),
    _summarize(generation_throughput_values),
    _summarize(running_values),
]

# Create a DataFrame: one column per statistic, one row per metric.
data = {
    name: [summary[name] for summary in _summaries]
    for name in _STAT_NAMES
}

df = pd.DataFrame(data, index=['first_token_times', 'prompt_throughput', 'generation_throughput', 'running'])

# Display the DataFrame
print(df)
|
||||
|
||||
7
examples/vllm_test/run_throughput.sh
Normal file
7
examples/vllm_test/run_throughput.sh
Normal file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
|
||||
model_path=$1
|
||||
engine_path=$2
|
||||
|
||||
#run test fixed input/output benchmark
|
||||
XMLIR_D_XPU_L3_SIZE=0 python benchmark_throughput.py --backend vllm --model $model_path --tokenizer $model_path --engine_dir $engine_path --tensor-parallel-size 8 --dummy-dataset --max-num-seqs 128 --max-num-batched-tokens 2048 --dummy-tokenid 1 --dummy-input-len 1024 --dummy-output-len 1024 --max-model-len 2048 --num-prompts 128 > server.log
|
||||
29
examples/vllm_test/template_alpaca.jinja
Normal file
29
examples/vllm_test/template_alpaca.jinja
Normal file
@@ -0,0 +1,29 @@
|
||||
{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
|
||||
|
||||
{% for message in messages %}
|
||||
{% if message['role'] == 'user' %}
|
||||
### Instruction:
|
||||
{{ message['content']|trim -}}
|
||||
{% if not loop.last %}
|
||||
|
||||
|
||||
{% endif %}
|
||||
{% elif message['role'] == 'assistant' %}
|
||||
### Response:
|
||||
{{ message['content']|trim -}}
|
||||
{% if not loop.last %}
|
||||
|
||||
|
||||
{% endif %}
|
||||
{% elif message['role'] == 'user_context' %}
|
||||
### Input:
|
||||
{{ message['content']|trim -}}
|
||||
{% if not loop.last %}
|
||||
|
||||
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
|
||||
### Response:
|
||||
{% endif %}
|
||||
1
examples/vllm_test/template_chatml.jinja
Normal file
1
examples/vllm_test/template_chatml.jinja
Normal file
@@ -0,0 +1 @@
|
||||
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
|
||||
30
examples/vllm_test/template_inkbot.jinja
Normal file
30
examples/vllm_test/template_inkbot.jinja
Normal file
@@ -0,0 +1,30 @@
|
||||
<#meta#>
|
||||
- Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }}
|
||||
- Task: {{ (messages|selectattr('role', 'equalto', 'meta-task_name')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }}
|
||||
<#system#>
|
||||
{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
|
||||
<#chat#>
|
||||
{% for message in messages %}
|
||||
{% if message['role'] == 'user' %}
|
||||
<#user#>
|
||||
{{ message['content']|trim -}}
|
||||
{% if not loop.last %}
|
||||
|
||||
{% endif %}
|
||||
{% elif message['role'] == 'assistant' %}
|
||||
<#bot#>
|
||||
{{ message['content']|trim -}}
|
||||
{% if not loop.last %}
|
||||
|
||||
{% endif %}
|
||||
{% elif message['role'] == 'user_context' %}
|
||||
<#user_context#>
|
||||
{{ message['content']|trim -}}
|
||||
{% if not loop.last %}
|
||||
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
|
||||
<#bot#>
|
||||
{% endif %}
|
||||
102
examples/vllm_test/test_api_client.py
Normal file
102
examples/vllm_test/test_api_client.py
Normal file
@@ -0,0 +1,102 @@
|
||||
"""Example Python client for vllm.entrypoints.api_server"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from typing import Iterable, List
|
||||
|
||||
import requests
|
||||
from xtrt_llm.vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
|
||||
|
||||
|
||||
def clear_line(n: int = 1) -> None:
    """Write the ANSI "cursor up" + "erase line" sequence *n* times to stdout."""
    cursor_up = '\033[1A'
    erase_line = '\x1b[2K'
    sequence = cursor_up + erase_line
    for _ in range(n):
        print(sequence, end='', flush=True)
|
||||
|
||||
|
||||
def post_http_request(prompt: str,
                      api_url: str,
                      n: int = 1,
                      stream: bool = False) -> requests.Response:
    """POST a beam-search generation request for *prompt* to *api_url*.

    The payload asks for ``n`` candidates of at most 16 tokens at temperature
    0.0 and forwards ``stream`` to the server. The HTTP response itself is
    always opened in streaming mode.
    """
    payload = dict(
        prompt=prompt,
        n=n,
        use_beam_search=True,
        temperature=0.0,
        max_tokens=16,
        stream=stream,
    )
    return requests.post(api_url,
                         headers={"User-Agent": "Test Client"},
                         json=payload,
                         stream=True)
|
||||
|
||||
|
||||
def get_streaming_response(response: requests.Response) -> Iterable[List[str]]:
    """Yield the ``"text"`` field of every NUL-delimited JSON chunk in *response*."""
    chunks = response.iter_lines(chunk_size=8192,
                                 decode_unicode=False,
                                 delimiter=b"\0")
    for raw in chunks:
        if not raw:
            # Skip empty chunks between delimiters.
            continue
        yield json.loads(raw.decode("utf-8"))["text"]
|
||||
|
||||
|
||||
def get_response(response: requests.Response) -> List[str]:
    """Return the ``"text"`` field of the JSON body of a non-streaming response."""
    return json.loads(response.content)["text"]
|
||||
|
||||
|
||||
def create_test_prompts() -> List[str]:
    """Create a list of test prompts.

    A fixed sequence of nine prompts (three distinct sentences in a set
    order) is repeated 100 times, giving 900 prompts in total.
    """
    to_be = "To be or not to be,"
    robot = "A robot may not injure a human being"
    heart = "It is only with the heart that one can see rightly"

    unit_prompts = [
        to_be, robot, robot,
        heart, robot, to_be,
        heart, to_be, heart,
    ]
    return unit_prompts * 100
|
||||
|
||||
if __name__ == "__main__":
    # Send either the single --prompt or the built-in prompt list to the
    # /generate endpoint and print every returned candidate.
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--n", type=int, default=4)
    parser.add_argument("--prompt", type=str, default="")
    parser.add_argument("--stream", action="store_true")
    args = parser.parse_args()

    prompt = args.prompt
    api_url = f"http://{args.host}:{args.port}/generate"
    n = args.n
    stream = args.stream

    # An empty --prompt means "use the built-in prompts".
    prompt_list = [prompt] if prompt != '' else create_test_prompts()

    for current_prompt in prompt_list:
        print(f"Prompt: {current_prompt!r}\n", flush=True)
        response = post_http_request(current_prompt, api_url, n, stream)

        if not stream:
            for rank, text in enumerate(get_response(response)):
                print(f"Beam candidate {rank}: {text!r}", flush=True)
            continue

        # Streaming: erase the lines printed for the previous chunk and
        # print the candidates of the new one in their place.
        num_printed_lines = 0
        for candidates in get_streaming_response(response):
            clear_line(num_printed_lines)
            num_printed_lines = 0
            for rank, text in enumerate(candidates):
                num_printed_lines += 1
                print(f"Beam candidate {rank}: {text!r}", flush=True)
|
||||
119
examples/vllm_test/test_llm_engine.py
Normal file
119
examples/vllm_test/test_llm_engine.py
Normal file
@@ -0,0 +1,119 @@
|
||||
import argparse
|
||||
from typing import List, Tuple
|
||||
import xtrt_llm
|
||||
from xtrt_llm.vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
|
||||
|
||||
|
||||
def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
    """Create a list of test prompts with their sampling parameters.

    Twelve prompts (three distinct sentences in a fixed order) are returned.
    Each is paired with its own ``SamplingParams`` instance configured with
    temperature 0.0, one logprob per generated and per prompt token, and at
    most 50 new tokens.
    """
    robot = "A robot may not injure a human being"
    to_be = "To be or not to be,"
    heart = "It is only with the heart that one can see rightly"

    ordered_prompts = [
        robot, to_be, to_be, robot, to_be, robot,
        robot, heart, robot, heart, robot, heart,
    ]
    # Other sampling set-ups (top-k with presence penalty, best_of with
    # top-p, beam search) are intentionally not part of this list.
    return [(text,
             SamplingParams(temperature=0.0,
                            logprobs=1,
                            prompt_logprobs=1,
                            max_tokens=50)) for text in ordered_prompts]
|
||||
|
||||
|
||||
def process_requests(engine: LLMEngine,
                     test_prompts: List[Tuple[str, SamplingParams]]):
    """Continuously process a list of prompts and handle the outputs.

    One prompt is taken from the front of ``test_prompts`` (the list is
    consumed in place) and submitted per engine step, until both the list
    and the engine are drained. Finished outputs are printed.
    """
    next_request_id = 0

    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            text, params = test_prompts.pop(0)
            engine.add_request(str(next_request_id), text, params)
            next_request_id += 1

        step_outputs: List[RequestOutput] = engine.step()
        finished = [output for output in step_outputs if output.finished]
        for output in finished:
            print("end_request_output:", output)
|
||||
|
||||
|
||||
def initialize_engine(args: argparse.Namespace) -> LLMEngine:
    """Build an ``LLMEngine`` from parsed command-line arguments."""
    return LLMEngine.from_engine_args(EngineArgs.from_cli_args(args))
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
    """Create the engine, then feed it the built-in test prompts."""
    # Arguments are evaluated left to right, so the engine is still
    # initialized before the prompts are created.
    process_requests(initialize_engine(args), create_test_prompts())
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Let EngineArgs register its own options on the parser, then run.
    cli_parser = EngineArgs.add_cli_args(
        argparse.ArgumentParser(
            description='Demo on using the LLMEngine class directly'))
    main(cli_parser.parse_args())
|
||||
Reference in New Issue
Block a user