Replace time.time() with time.perf_counter() for benchmarking. (#6178)

Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
This commit is contained in:
Lifu Huang
2025-05-11 14:32:49 -07:00
committed by GitHub
parent e9a47f4cb5
commit 6e2da51561
61 changed files with 158 additions and 158 deletions

View File

@@ -64,11 +64,11 @@ def test_batch_by_batch(all_prompts, gen_len):
tot_time = 0
for i in range(len(all_prompts)):
tic = time.time()
tic = time.perf_counter()
text_qa.run_batch(
list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))),
)
tot_time += time.time() - tic
tot_time += time.perf_counter() - tic
return tot_time
@@ -78,13 +78,13 @@ def test_batch_by_batch_with_hint(all_prompts, gen_len):
tot_time = 0
for i in range(len(all_prompts)):
tic = time.time()
tic = time.perf_counter()
# Send a hint to cache the prefix
text_qa.run_batch(list(zip(all_prompts[i][:1], [gen_len])))
# Send the batch
text_qa.run_batch(list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))))
tot_time += time.time() - tic
tot_time += time.perf_counter() - tic
return tot_time
@@ -94,11 +94,11 @@ def test_send_all(all_prompts, gen_len):
all_prompts = [x for prompt_list in all_prompts for x in prompt_list]
tic = time.time()
tic = time.perf_counter()
text_qa.run_batch(
list(zip(all_prompts, [gen_len] * len(all_prompts))),
)
tot_time = time.time() - tic
tot_time = time.perf_counter() - tic
return tot_time

View File

@@ -81,7 +81,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
}
data = {"text": prompts, "sampling_params": sampling_params}
start_time = time.time()
start_time = time.perf_counter()
try:
response = requests.post(
endpoint.base_url + "/generate", json=data, timeout=3600
@@ -90,7 +90,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
error = response.json()
raise RuntimeError(f"Request {request_id} failed: {error}")
result = response.json()
elapsed_time = (time.time() - start_time) * 1000 # Convert to ms
elapsed_time = (time.perf_counter() - start_time) * 1000 # Convert to ms
avg_per_prompt = elapsed_time / len(prompts) if prompts else 0
return request_id, elapsed_time, avg_per_prompt, True, len(prompts)
except Exception as e:
@@ -104,7 +104,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
num_requests = len(batched_prompts)
# Record start time for total latency
benchmark_start_time = time.time()
benchmark_start_time = time.perf_counter()
for i, batch_prompts in enumerate(batched_prompts):
request_id = i + 1
@@ -119,7 +119,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
results.append(result)
# Calculate total latency
total_latency = (time.time() - benchmark_start_time) * 1000 # Convert to ms
total_latency = (time.perf_counter() - benchmark_start_time) * 1000 # Convert to ms
return results, total_latency

View File

@@ -44,10 +44,10 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
for run in range(NUM_RUNS):
batch_prompts = prompts[:batch_size] # Use same prompts for fair comparison
start_time = time.time()
start_time = time.perf_counter()
for prompt in batch_prompts:
tokens = tokenizer.encode(prompt)
sequential_time = (time.time() - start_time) * 1000
sequential_time = (time.perf_counter() - start_time) * 1000
sequential_times.append(sequential_time)
# Batch tokenization using tokenizer()
@@ -55,9 +55,9 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
for run in range(NUM_RUNS):
batch_prompts = prompts[:batch_size] # Use same prompts for fair comparison
start_time = time.time()
start_time = time.perf_counter()
tokens = tokenizer(batch_prompts)
batch_time = (time.time() - start_time) * 1000
batch_time = (time.perf_counter() - start_time) * 1000
batch_times.append(batch_time)
return {

View File

@@ -39,7 +39,7 @@ def main(args):
answer = await call_generate(**arg, temperature=0)
states.append(answer)
tic = time.time()
tic = time.perf_counter()
# we always sequentially execute agent calls to maintain its dependency
if args.backend != "lmql":
for arg in tqdm(arguments):
@@ -50,7 +50,7 @@ def main(args):
loop = asyncio.get_event_loop()
for arg in tqdm(arguments):
loop.run_until_complete(get_one_answer_async(arg))
latency = time.time() - tic
latency = time.perf_counter() - tic
print(f"Latency: {latency:.3f}")

View File

@@ -35,14 +35,14 @@ def main(args):
states = []
# Run requests
tic = time.time()
tic = time.perf_counter()
for a in arguments:
# only a single key in the dict
for func, arg in a.items():
result = func.run(**arg)
result.sync()
states.append(result)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")

View File

@@ -75,7 +75,7 @@ def main(args):
)
states[i] = answer
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(questions))):
get_one_answer(i)
@@ -106,9 +106,9 @@ def main(args):
for j in range(len(rets)):
states[i + j] = rets[j]
tic = time.time()
tic = time.perf_counter()
asyncio.run(batched_call(batch_size=args.parallel))
latency = time.time() - tic
latency = time.perf_counter() - tic
preds = []
for i in range(len(states)):

View File

@@ -84,14 +84,14 @@ def main(args):
#####################################
# Run requests
tic = time.time()
tic = time.perf_counter()
states = few_shot_gsm8k.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
preds = []
for i in range(len(states)):

View File

@@ -57,7 +57,7 @@ def main(args):
context=few_shot_examples + questions[i], choices=choices[i]
)
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(questions))):
get_one_answer(i)
@@ -82,10 +82,10 @@ def main(args):
for j in range(len(rets)):
preds[i + j] = rets[j]
tic = time.time()
tic = time.perf_counter()
asyncio.run(batched_call(batch_size=args.parallel))
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels))

View File

@@ -68,7 +68,7 @@ def main(args):
#####################################
# Run requests
tic = time.time()
tic = time.perf_counter()
rets = few_shot_hellaswag.run_batch(
arguments,
temperature=0,
@@ -76,7 +76,7 @@ def main(args):
progress_bar=True,
)
preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels))

View File

@@ -261,7 +261,7 @@ class WorkloadGenerator:
client_id, payload = item
response = await async_request_sglang_generate(payload, self.url, self.pbar)
if self.pbar.n == self.pbar.total:
self.finished_time = time.time()
self.finished_time = time.perf_counter()
self.response_queue.put((client_id, response))
except Exception as e:
print(f"Request failed: {e}")
@@ -334,7 +334,7 @@ class WorkloadGenerator:
request_thread = threading.Thread(target=self.request_sender, daemon=True)
response_thread = threading.Thread(target=self.response_handler, daemon=True)
self.start_time = time.time()
self.start_time = time.perf_counter()
request_thread.start()
response_thread.start()

View File

@@ -53,7 +53,7 @@ def main(args):
def get_one_answer(i):
states[i] = json_decode(generate=call_generate, **arguments[i])
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(arguments))):
get_one_answer(i)
@@ -68,7 +68,7 @@ def main(args):
for _ in rets:
pass
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")

View File

@@ -63,11 +63,11 @@ def main(args):
json_warm_up.run().sync()
# Run requests
tic = time.time()
tic = time.perf_counter()
states = json_decode.run_batch(
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")

View File

@@ -175,7 +175,7 @@ def bench_character(args):
else:
raise ValueError(f"Invalid backend: {args.backend}")
tic = time.time()
tic = time.perf_counter()
if args.backend != "lmql":
if args.parallel == 1:
@@ -202,7 +202,7 @@ def bench_character(args):
asyncio.gather(*[get_one_answer_async(i) for i in bt])
)
latency = time.time() - tic
latency = time.perf_counter() - tic
return states, latency
@@ -236,7 +236,7 @@ def bench_city_doc(args):
else:
raise ValueError(f"Invalid backend: {args.backend}")
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(arguments))):
get_one_answer(i)
@@ -246,7 +246,7 @@ def bench_city_doc(args):
for _ in rets:
pass
latency = time.time() - tic
latency = time.perf_counter() - tic
return states, latency

View File

@@ -67,14 +67,14 @@ def bench_city_doc(args):
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
tic = time.perf_counter()
states = city_gen.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
return states, latency
@@ -91,14 +91,14 @@ def bench_character(args):
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
tic = time.perf_counter()
states = character_gen.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
return states, latency

View File

@@ -85,14 +85,14 @@ def bench_schema(args):
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
tic = time.perf_counter()
states = schema_gen.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Check if the outputs are valid
indexes = []

View File

@@ -487,7 +487,7 @@ def main(args: argparse.Namespace):
]
print(f"Start tuning over {len(search_space)} configurations...")
start = time.time()
start = time.perf_counter()
configs = _distribute(
"tune",
[
@@ -522,7 +522,7 @@ def main(args: argparse.Namespace):
use_int8_w8a16,
block_shape,
)
end = time.time()
end = time.perf_counter()
print(f"Tuning took {end - start:.2f} seconds")
else:
outputs = _distribute(

View File

@@ -359,7 +359,7 @@ def tune_on_gpu(args_dict):
config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0
]
start = time.time()
start = time.perf_counter()
results = {}
for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
N, K = shape[0], shape[1]
@@ -379,7 +379,7 @@ def tune_on_gpu(args_dict):
best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)}
save_configs(N, K, block_n, block_k, best_configs, save_path, input_type)
end = time.time()
end = time.perf_counter()
print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")

View File

@@ -70,7 +70,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
# Select backend
backend = select_sglang_backend(args)
tic = time.time()
tic = time.perf_counter()
states = line_retrieval.run_batch(
arguments,
temperature=0,
@@ -78,7 +78,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
corrects = []
for i in range(len(arguments)):

View File

@@ -41,7 +41,7 @@ def main(args):
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm.tqdm(range(len(lines))):
image_file = arguments[i]["image_file"]
@@ -52,7 +52,7 @@ def main(args):
states = image_qa.run_batch(
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
)
latency = time.time() - tic
latency = time.perf_counter() - tic
print(f"Latency: {latency:.3f}")

View File

@@ -85,7 +85,7 @@ def main(args):
call_generate = partial(get_call_generate(args), temperature=0)
# Run requests
tic = time.time()
tic = time.perf_counter()
if args.backend != "lmql":
@@ -120,7 +120,7 @@ def main(args):
asyncio.gather(*[get_one_answer_async(i) for i in bt])
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")

View File

@@ -59,7 +59,7 @@ def main(args):
backend = select_sglang_backend(args)
# Run requests
tic = time.time()
tic = time.perf_counter()
states = multi_dimension_judge.run_batch(
arguments,
temperature=0,
@@ -67,7 +67,7 @@ def main(args):
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
print(f"Latency: {latency:.3f}")

View File

@@ -45,7 +45,7 @@ def main(args):
def get_one_answer(i):
states[i] = json_decode(generate=call_generate, **arguments[i])
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(arguments))):
get_one_answer(i)
@@ -58,7 +58,7 @@ def main(args):
)
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")

View File

@@ -46,11 +46,11 @@ def main(args):
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
tic = time.perf_counter()
states = json_decode.run_batch(
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")

View File

@@ -76,7 +76,7 @@ def evaluate(args, subject, dev_df, test_df, call_generate):
pred = call_generate(prompts[i], temperature=0, max_tokens=max_tokens)
preds[i] = pred.strip()[0]
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in range(len(prompts)):
get_one_answer(i)
@@ -94,9 +94,9 @@ def evaluate(args, subject, dev_df, test_df, call_generate):
for j in range(len(rets)):
preds[i + j] = rets[j].strip()[0]
tic = time.time()
tic = time.perf_counter()
asyncio.run(batched_call(batch_size=args.parallel))
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
cors = [pred == label for pred, label in zip(preds, labels)]

View File

@@ -116,7 +116,7 @@ def main(args):
backend = select_sglang_backend(args)
# Run
tic = time.time()
tic = time.perf_counter()
states = few_shot_mmlu.run_batch(
arguments,
temperature=0,
@@ -128,7 +128,7 @@ def main(args):
preds = [
s["answer"].strip()[0] if len(s["answer"].strip()) > 0 else "" for s in states
]
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
cors = [pred == label for pred, label in zip(preds, labels)]

View File

@@ -119,7 +119,7 @@ async def eval_mmmu(args) -> None:
api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1"
)
semaphore = asyncio.Semaphore(args.concurrency)
start = time.time()
start = time.perf_counter()
base_url = f"http://127.0.0.1:{args.port}"
if args.profile:
@@ -147,7 +147,7 @@ async def eval_mmmu(args) -> None:
if profile_output.success:
print("Profiler stopped")
print(f"Benchmark time: {time.time() - start}")
print(f"Benchmark time: {time.perf_counter() - start}")
args.output_path = f"./val_sglang.json"
save_json(args.output_path, out_samples)
eval_result(model_answer_path=args.output_path, answer_dict=answer_dict)

View File

@@ -66,7 +66,7 @@ def main(args):
answers[i] = cur_answers
# Run requests
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(questions))):
get_answer(i)
@@ -79,7 +79,7 @@ def main(args):
)
)
latency = time.time() - tic
latency = time.perf_counter() - tic
print(f"#questions: {len(questions)}, Latency: {latency:.2f}")

View File

@@ -57,7 +57,7 @@ def main(args):
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
tic = time.perf_counter()
rets = answer_mt_bench.run_batch(
arguments,
temperature=0,
@@ -66,7 +66,7 @@ def main(args):
progress_bar=True,
)
answers = [[s["answer_1"], s["answer_2"]] for s in rets]
latency = time.time() - tic
latency = time.perf_counter() - tic
print(f"#questions: {len(questions)}, Latency: {latency:.2f}")

View File

@@ -68,7 +68,7 @@ def main(args):
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
tic = time.perf_counter()
rets = answer_mt_bench.run_batch(
arguments,
temperature=0,
@@ -78,7 +78,7 @@ def main(args):
)
answers = [[s["answer_1"], s["answer_2"]] for s in rets]
latency = time.time() - tic
latency = time.perf_counter() - tic
num_output_tokens = sum(
s.get_meta_info("answer_1")["completion_tokens"]
+ s.get_meta_info("answer_2")["completion_tokens"]

View File

@@ -113,7 +113,7 @@ def main(args):
answer = multi_chain_gsm8k(questions[i], args.num_chains, call_generate)
states[i] = answer
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(questions))):
get_one_answer(i)
@@ -134,7 +134,7 @@ def main(args):
)
states[i] = answer
tic = time.time()
tic = time.perf_counter()
loop = asyncio.get_event_loop()
batches = [
list(range(i, min(i + args.parallel, len(questions))))
@@ -144,7 +144,7 @@ def main(args):
tasks = [get_one_answer_asyncio(k) for k in bt]
loop.run_until_complete(asyncio.gather(*tasks))
latency = time.time() - tic
latency = time.perf_counter() - tic
preds = []
for i in range(len(states)):

View File

@@ -90,7 +90,7 @@ def main(args):
backend = select_sglang_backend(args)
# Run requests
tic = time.time()
tic = time.perf_counter()
states = multi_chain_gsm8k.run_batch(
arguments,
temperature=0,
@@ -98,7 +98,7 @@ def main(args):
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
preds = []
for i in range(len(states)):

View File

@@ -61,7 +61,7 @@ def main(args):
def get_one_answer(i):
states[i] = multi_document_qa(generate=call_generate, **arguments[i])
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(labels))):
get_one_answer(i)
@@ -74,7 +74,7 @@ def main(args):
)
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print(states)

View File

@@ -49,11 +49,11 @@ def main(args):
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
tic = time.perf_counter()
states = multi_document_qa.run_batch(
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print([s["answer"] for s in states])

View File

@@ -35,7 +35,7 @@ def main(args):
def get_one_answer(i):
states[i] = multi_turns(generate=call_generate, **multi_qas[i])
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(multi_qas))):
get_one_answer(i)
@@ -50,7 +50,7 @@ def main(args):
for _ in rets:
pass
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")

View File

@@ -27,7 +27,7 @@ def main(args):
backend = select_sglang_backend(args)
tic = time.time()
tic = time.perf_counter()
states = multi_turns.run_batch(
multi_qas,
temperature=0,
@@ -35,7 +35,7 @@ def main(args):
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
print(f"Latency: {latency:.3f}")

View File

@@ -84,7 +84,7 @@ def main(args):
backend = select_sglang_backend(args)
tic = time.time()
tic = time.perf_counter()
states = multi_turns.run_batch(
multi_qas,
temperature=0,
@@ -92,7 +92,7 @@ def main(args):
num_threads="auto",
progress_bar=True,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
print(f"Latency: {latency:.3f}")

View File

@@ -146,7 +146,7 @@ def main(args):
states.append(answer)
tic = time.time()
tic = time.perf_counter()
if args.backend != "lmql":
if args.parallel == 1:
@@ -173,7 +173,7 @@ def main(args):
tasks = [run_single_agent_async(arg) for arg in bt]
loop.run_until_complete(asyncio.gather(*tasks))
latency = time.time() - tic
latency = time.perf_counter() - tic
print(f"Latency: {latency:.3f}")

View File

@@ -115,14 +115,14 @@ def main(args):
sgl.set_default_backend(backend)
states = []
tic = time.time()
tic = time.perf_counter()
states = webthink.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")

View File

@@ -51,7 +51,7 @@ def main(args):
)
# Run requests
tic = time.time()
tic = time.perf_counter()
states = reasoning_gen.run_batch(
questions,
num_threads=args.parallel,
@@ -60,7 +60,7 @@ def main(args):
max_new_tokens=32768,
top_p=0.95,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Extract results and record outcomes in a list.
outcomes = []

View File

@@ -68,7 +68,7 @@ def main(args):
call_generate = partial(get_call_generate(args), temperature=0)
# Run requests
tic = time.time()
tic = time.perf_counter()
if args.backend != "lmql":
def get_one_answer(i):
@@ -102,7 +102,7 @@ def main(args):
loop.run_until_complete(
asyncio.gather(*[get_one_answer_async(i) for i in batch])
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")

View File

@@ -65,11 +65,11 @@ def main(args):
sgl.set_default_backend(select_sglang_backend(args))
# Run requests
tic = time.time()
tic = time.perf_counter()
states = suggest_tips.run_batch(
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")

View File

@@ -138,7 +138,7 @@ def main(args):
# Run requests
states = [None] * len(questions)
tic = time.time()
tic = time.perf_counter()
if args.backend != "lmql":
def get_one_answer(i):
@@ -177,7 +177,7 @@ def main(args):
tasks = [get_one_answer_async(k) for k in bt]
loop.run_until_complete(asyncio.gather(*tasks))
latency = time.time() - tic
latency = time.perf_counter() - tic
answers_text = []
for s in states:

View File

@@ -119,7 +119,7 @@ def main(args):
backend = select_sglang_backend(args)
# Run requests
tic = time.time()
tic = time.perf_counter()
states = tree_search.run_batch(
arguments,
temperature=0,
@@ -127,7 +127,7 @@ def main(args):
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
answers_text = []
for s in states:
answers_text.append([x for xs in s.ret_value for x in xs])

View File

@@ -121,7 +121,7 @@ def main(args):
def get_one_answer(i):
states[i] = tree_search(**arguments[i], call_generate=call_generate)
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(questions))):
get_one_answer(i)
@@ -134,7 +134,7 @@ def main(args):
)
)
latency = time.time() - tic
latency = time.perf_counter() - tic
answers_text = []
for s in states:

View File

@@ -107,7 +107,7 @@ def main(args):
backend = select_sglang_backend(args)
# Run requests
tic = time.time()
tic = time.perf_counter()
states = tree_search.run_batch(
arguments,
temperature=0,
@@ -115,7 +115,7 @@ def main(args):
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
answers_text = []
for s in states:
answers_text.append([x for xs in s["answer"] for x in xs])

View File

@@ -90,7 +90,7 @@ def run_eval(args):
#####################################
# Run requests
tic = time.time()
tic = time.perf_counter()
states = few_shot_gsm8k.run_batch(
arguments,
temperature=args.temperature if hasattr(args, "temperature") else 0,
@@ -99,7 +99,7 @@ def run_eval(args):
return_logprob=getattr(args, "return_logprob", None),
logprob_start_len=getattr(args, "logprob_start_len", None),
)
latency = time.time() - tic
latency = time.perf_counter() - tic
preds = []
for i in range(len(states)):

View File

@@ -89,7 +89,7 @@ def run_eval(args):
}
# Run requests
tic = time.time()
tic = time.perf_counter()
loop = asyncio.get_event_loop()
@@ -98,7 +98,7 @@ def run_eval(args):
)
# End requests
latency = time.time() - tic
latency = time.perf_counter() - tic
# Shutdown the engine
engine.shutdown()

View File

@@ -71,9 +71,9 @@ def run_eval(args):
)
# Run eval
tic = time.time()
tic = time.perf_counter()
result = eval_obj(sampler)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Dump reports
metrics = result.metrics | {"score": result.score}

View File

@@ -503,7 +503,7 @@ def test_hellaswag_select():
#####################################
# Run requests
tic = time.time()
tic = time.perf_counter()
rets = few_shot_hellaswag.run_batch(
arguments,
temperature=0,
@@ -514,13 +514,13 @@ def test_hellaswag_select():
preds = []
for i, ret in enumerate(rets):
preds.append(choices[i].index(ret["answer"]))
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
accuracy = np.mean(np.array(preds) == np.array(labels))
# Test generator style of run_batch
tic = time.time()
tic = time.perf_counter()
rets = few_shot_hellaswag.run_batch(
arguments,
temperature=0,
@@ -531,7 +531,7 @@ def test_hellaswag_select():
preds_gen = []
for i, ret in enumerate(rets):
preds_gen.append(choices[i].index(ret["answer"]))
latency_gen = time.time() - tic
latency_gen = time.perf_counter() - tic
# Compute accuracy
accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))

View File

@@ -449,9 +449,9 @@ def popen_launch_server(
else:
process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
start_time = time.time()
start_time = time.perf_counter()
with requests.Session() as session:
while time.time() - start_time < timeout:
while time.perf_counter() - start_time < timeout:
try:
headers = {
"Content-Type": "application/json; charset=utf-8",
@@ -584,7 +584,7 @@ class TestFile:
def run_unittest_files(files: List[TestFile], timeout_per_file: float):
tic = time.time()
tic = time.perf_counter()
success = True
for i, file in enumerate(files):
@@ -599,13 +599,13 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
flush=True,
)
tic = time.time()
tic = time.perf_counter()
process = subprocess.Popen(
["python3", filename], stdout=None, stderr=None, env=os.environ
)
process.wait()
elapsed = time.time() - tic
elapsed = time.perf_counter() - tic
print(
f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
@@ -631,9 +631,9 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
break
if success:
print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
else:
print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
return 0 if success else -1

View File

@@ -92,9 +92,9 @@ def popen_launch_router(
process = subprocess.Popen(command, stdout=None, stderr=None)
start_time = time.time()
start_time = time.perf_counter()
with requests.Session() as session:
while time.time() - start_time < timeout:
while time.perf_counter() - start_time < timeout:
try:
response = session.get(f"{base_url}/health")
if response.status_code == 200:
@@ -155,11 +155,11 @@ def terminate_and_wait(process, timeout=300):
return
process.terminate()
start_time = time.time()
start_time = time.perf_counter()
while process.poll() is None:
print(f"Terminating process {process.pid}")
if time.time() - start_time > timeout:
if time.perf_counter() - start_time > timeout:
raise TimeoutError(
f"Process {process.pid} failed to terminate within {timeout}s"
)

View File

@@ -184,9 +184,9 @@ class ExperimentRunner:
self.logger = logging.getLogger(__name__)
def wait_for_server(self, port: int, timeout: int = 300) -> bool:
start_time = time.time()
start_time = time.perf_counter()
while time.time() - start_time < timeout:
while time.perf_counter() - start_time < timeout:
try:
response = requests.get(f"http://localhost:{port}/health")
if response.status_code == 200:
@@ -197,7 +197,7 @@ class ExperimentRunner:
return False
def run_task(self, config: TaskConfig) -> TaskResult:
start_time = time.time()
start_time = time.perf_counter()
client_output = []
try:
@@ -247,7 +247,7 @@ class ExperimentRunner:
name=config.name,
success=True,
output=formatted_output,
runtime=time.time() - start_time,
runtime=time.perf_counter() - start_time,
timestamp=datetime.now().isoformat(),
)
@@ -256,7 +256,7 @@ class ExperimentRunner:
name=config.name,
success=False,
output=str(e),
runtime=time.time() - start_time,
runtime=time.perf_counter() - start_time,
timestamp=datetime.now().isoformat(),
)

View File

@@ -79,9 +79,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
# warm up
hf_outputs = hf_runner.forward(truncated_prompts)
st_start_time = time.time()
st_start_time = time.perf_counter()
hf_outputs = hf_runner.forward(truncated_prompts)
st_end_time = time.time()
st_end_time = time.perf_counter()
with SRTRunner(
model_path,
@@ -95,9 +95,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
# warm up
srt_outputs = srt_runner.forward(truncated_prompts)
sgl_start_time = time.time()
sgl_start_time = time.perf_counter()
srt_outputs = srt_runner.forward(truncated_prompts)
sgl_end_time = time.time()
sgl_end_time = time.perf_counter()
transformer_time = st_end_time - st_start_time
sgl_time = sgl_end_time - sgl_start_time

View File

@@ -130,9 +130,9 @@ class TestGPTQModelDynamic(CustomTestCase):
def test_throughput(self):
max_tokens = 256
tic = time.time()
tic = time.perf_counter()
result = self.run_decode(max_tokens)
tok = time.time()
tok = time.perf_counter()
print(f"result = `{result}`")
@@ -185,9 +185,9 @@ class TestGPTQModelDynamicWithMarlin(CustomTestCase):
def test_throughput(self):
max_tokens = 256
tic = time.time()
tic = time.perf_counter()
result = self.run_decode(max_tokens)
tok = time.time()
tok = time.perf_counter()
print(f"result = `{result}`")

View File

@@ -42,10 +42,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
)
print("release_memory_occupation start")
t = time.time()
t = time.perf_counter()
engine.release_memory_occupation()
if _DEBUG_EXTRA:
print("release_memory_occupation", time.time() - t)
print("release_memory_occupation", time.perf_counter() - t)
if _DEBUG_EXTRA:
time.sleep(5)
@@ -60,10 +60,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
time.sleep(5)
print("resume_memory_occupation start")
t = time.time()
t = time.perf_counter()
engine.resume_memory_occupation()
if _DEBUG_EXTRA:
print("resume_memory_occupation", time.time() - t)
print("resume_memory_occupation", time.perf_counter() - t)
self.assertEqual(
_try_allocate_big_tensor(),

View File

@@ -62,9 +62,9 @@ class TestTorchCompile(CustomTestCase):
res = self.run_decode(16)
max_tokens = 256
tic = time.time()
tic = time.perf_counter()
res = self.run_decode(max_tokens)
tok = time.time()
tok = time.perf_counter()
print(f"{res=}")
throughput = max_tokens / (tok - tic)
print(f"Throughput: {throughput} tokens/s")

View File

@@ -62,9 +62,9 @@ class TestTorchCompileMoe(CustomTestCase):
res = self.run_decode(16)
max_tokens = 256
tic = time.time()
tic = time.perf_counter()
res = self.run_decode(max_tokens)
tok = time.time()
tok = time.perf_counter()
print(f"{res=}")
throughput = max_tokens / (tok - tic)
self.assertGreaterEqual(throughput, 285)

View File

@@ -61,9 +61,9 @@ class TestTorchAO(CustomTestCase):
max_tokens = 256
tic = time.time()
tic = time.perf_counter()
res = self.run_decode(max_tokens)
tok = time.time()
tok = time.perf_counter()
print(res["text"])
throughput = max_tokens / (tok - tic)
print(f"Throughput: {throughput} tokens/s")

View File

@@ -164,7 +164,7 @@ def init_process_hf(
)
dist.barrier(group=group, device_ids=[rank])
torch.cuda.synchronize()
time_begin_broadcast = time.time()
time_begin_broadcast = time.perf_counter()
# The last parameter is lm_head.weight, which is tied
# with embed_tokens.weight. Actually, we only need
@@ -182,7 +182,7 @@ def init_process_hf(
group=group,
)
torch.cuda.synchronize()
time_end_broadcast = time.time()
time_end_broadcast = time.perf_counter()
# Measure the latency of broadcasting/weights update.
broadcast_time = time_end_broadcast - time_begin_broadcast
@@ -282,7 +282,7 @@ def init_process_sgl(
)
torch.cuda.synchronize()
time_begin_update = time.time()
time_begin_update = time.perf_counter()
# The last parameter is lm_head.weight, which is tied
# with embed_tokens.weight. Actually, we only need
@@ -312,7 +312,7 @@ def init_process_sgl(
},
)
torch.cuda.synchronize()
time_end_update = time.time()
time_end_update = time.perf_counter()
# Measure the latency of broadcast/weights update.
update_time = time_end_update - time_begin_update

View File

@@ -21,9 +21,9 @@ def test_update_weights_from_tensor(tp_size):
memory_before = torch.cuda.memory_allocated()
new_tensor = torch.full((16384, 2048), 1.5, device="cuda")
time_start = time.time()
time_start = time.perf_counter()
engine.update_weights_from_tensor([(x, new_tensor) for x in param_names])
print(f"Time delta: {time.time() - time_start:.03f}")
print(f"Time delta: {time.perf_counter() - time_start:.03f}")
for param_name in param_names[:3]:
_check_param(engine, param_name, [1.5] * 5)

View File

@@ -62,9 +62,9 @@ class TestW8A8(CustomTestCase):
def test_throughput(self):
max_tokens = 256
tic = time.time()
tic = time.perf_counter()
res = self.run_decode(max_tokens)
tok = time.time()
tok = time.perf_counter()
print(res["text"])
throughput = max_tokens / (tok - tic)
print(f"Throughput: {throughput} tokens/s")