import numpy as np
|
|
from vllm import LLM, SamplingParams
|
|
import os
|
|
import pandas as pd
|
|
|
|
def test_generating_csv():
    """Smoke-test the latency-debug CSV dump path.

    Runs a short generation with ``VLLM_LATENCY_DEBUG=1`` set and asserts
    that a metrics CSV is produced whose columns echo the run
    configuration (batch size, model, input/output lengths, tp,
    quantized weight dtype).

    Contents adapted from benchmark_latency.py.

    NOTE(review): ``llm.get_metrics`` / ``llm.dump_info`` are not part of
    upstream vLLM — this assumes a fork that writes "output.csv" in the
    current working directory. Confirm against that fork's API.
    """
    csv_file = "output.csv"
    # Start from a clean slate so a stale file cannot mask a failure.
    # (Bug fix: the original removed the hard-coded string "output.csv"
    # instead of the csv_file variable it had just checked.)
    if os.path.isfile(csv_file):
        os.remove(csv_file)
    assert not os.path.isfile(csv_file)

    # Enables the fork-specific latency-debug path that dumps metrics
    # to CSV.
    os.environ['VLLM_LATENCY_DEBUG'] = "1"

    model_path = "/data/vllm/sq_per_tensor_per_channel/Llama-2-7b-hf"
    tp = 1
    batch_size = 4
    input_len = 128
    output_len = 5
    quantization = "smoothquant"

    try:
        llm = LLM(model=model_path,
                  tokenizer=model_path,
                  quantization=quantization,
                  tensor_parallel_size=tp,
                  trust_remote_code=True,
                  enforce_eager=True)
        sampling_params = SamplingParams(
            n=1,
            temperature=1.0,
            top_p=1.0,
            ignore_eos=True,  # force exactly output_len tokens per prompt
            max_tokens=output_len,
        )
        # Random token ids are fine: we only care about the metrics CSV,
        # not the generated text.
        dummy_prompt_token_ids = np.random.randint(
            10000, size=(batch_size, input_len)).tolist()
        llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                     sampling_params=sampling_params,
                     use_tqdm=False)
        llm.get_metrics(0,             # args.num_iters_warmup
                        False,         # args.only_average
                        input_len,     # args.input_len
                        output_len,    # args.output_len
                        tp,            # args.tensor_parallel_size
                        quantization,  # args.quantization
                        llm.dump_info)

        assert os.path.isfile(csv_file)
        df = pd.read_csv(csv_file)
        # Single-row CSV: .item() asserts exactly one value per column.
        assert df['batch size'].item() == batch_size
        assert df['model'].item() == model_path
        assert df['input len'].item() == input_len
        assert df['output len'].item() == output_len
        assert df['tp'].item() == tp
        assert df['weight dtype'].item() == "SmoothQuant-int8"
    finally:
        # Robustness fix: always undo the env var and delete the CSV so
        # a failing assertion above cannot pollute later tests.
        os.environ.pop('VLLM_LATENCY_DEBUG', None)
        if os.path.isfile(csv_file):
            os.remove(csv_file)