diff --git a/benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py b/benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py index 86648e5ff..282097112 100644 --- a/benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py +++ b/benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py @@ -64,11 +64,11 @@ def test_batch_by_batch(all_prompts, gen_len): tot_time = 0 for i in range(len(all_prompts)): - tic = time.time() + tic = time.perf_counter() text_qa.run_batch( list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))), ) - tot_time += time.time() - tic + tot_time += time.perf_counter() - tic return tot_time @@ -78,13 +78,13 @@ def test_batch_by_batch_with_hint(all_prompts, gen_len): tot_time = 0 for i in range(len(all_prompts)): - tic = time.time() + tic = time.perf_counter() # Send a hint to cache the prefix text_qa.run_batch(list(zip(all_prompts[i][:1], [gen_len]))) # Send the batch text_qa.run_batch(list(zip(all_prompts[i], [gen_len] * len(all_prompts[i])))) - tot_time += time.time() - tic + tot_time += time.perf_counter() - tic return tot_time @@ -94,11 +94,11 @@ def test_send_all(all_prompts, gen_len): all_prompts = [x for prompt_list in all_prompts for x in prompt_list] - tic = time.time() + tic = time.perf_counter() text_qa.run_batch( list(zip(all_prompts, [gen_len] * len(all_prompts))), ) - tot_time = time.time() - tic + tot_time = time.perf_counter() - tic return tot_time diff --git a/benchmark/benchmark_batch/benchmark_batch.py b/benchmark/benchmark_batch/benchmark_batch.py index 15ef0ab6a..a8592d48a 100644 --- a/benchmark/benchmark_batch/benchmark_batch.py +++ b/benchmark/benchmark_batch/benchmark_batch.py @@ -81,7 +81,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id): } data = {"text": prompts, "sampling_params": sampling_params} - start_time = time.time() + start_time = time.perf_counter() try: response = requests.post( endpoint.base_url + "/generate", json=data, timeout=3600 @@ -90,7 +90,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id): error = response.json() raise RuntimeError(f"Request {request_id} failed: {error}") result = response.json() - elapsed_time = (time.time() - start_time) * 1000 # Convert to ms + elapsed_time = (time.perf_counter() - start_time) * 1000 # Convert to ms avg_per_prompt = elapsed_time / len(prompts) if prompts else 0 return request_id, elapsed_time, avg_per_prompt, True, len(prompts) except Exception as e: @@ -104,7 +104,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens): num_requests = len(batched_prompts) # Record start time for total latency - benchmark_start_time = time.time() + benchmark_start_time = time.perf_counter() for i, batch_prompts in enumerate(batched_prompts): request_id = i + 1 @@ -119,7 +119,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens): results.append(result) # Calculate total latency - total_latency = (time.time() - benchmark_start_time) * 1000 # Convert to ms + total_latency = (time.perf_counter() - benchmark_start_time) * 1000 # Convert to ms return results, total_latency diff --git a/benchmark/benchmark_batch/benchmark_tokenizer.py b/benchmark/benchmark_batch/benchmark_tokenizer.py index c00bfb84b..88a5820b6 100644 --- a/benchmark/benchmark_batch/benchmark_tokenizer.py +++ b/benchmark/benchmark_batch/benchmark_tokenizer.py @@ -44,10 +44,10 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer): for run in range(NUM_RUNS): batch_prompts = prompts[:batch_size] # Use same prompts for fair comparison - start_time = time.time() + start_time = time.perf_counter() for prompt in batch_prompts: tokens = tokenizer.encode(prompt) - sequential_time = (time.time() - start_time) * 1000 + sequential_time = (time.perf_counter() - start_time) * 1000 sequential_times.append(sequential_time) # Batch tokenization using tokenizer() @@ -55,9 +55,9 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer): for run in range(NUM_RUNS): batch_prompts = prompts[:batch_size] # Use same prompts for fair comparison - start_time = time.time() + start_time = time.perf_counter() tokens = tokenizer(batch_prompts) - batch_time = (time.time() - start_time) * 1000 + batch_time = (time.perf_counter() - start_time) * 1000 batch_times.append(batch_time) return { diff --git a/benchmark/generative_agents/bench_other.py b/benchmark/generative_agents/bench_other.py index 48f6ebc40..c0b3a3406 100644 --- a/benchmark/generative_agents/bench_other.py +++ b/benchmark/generative_agents/bench_other.py @@ -39,7 +39,7 @@ def main(args): answer = await call_generate(**arg, temperature=0) states.append(answer) - tic = time.time() + tic = time.perf_counter() # we always sequentially execute agent calls to maintain its dependency if args.backend != "lmql": for arg in tqdm(arguments): @@ -50,7 +50,7 @@ def main(args): loop = asyncio.get_event_loop() for arg in tqdm(arguments): loop.run_until_complete(get_one_answer_async(arg)) - latency = time.time() - tic + latency = time.perf_counter() - tic print(f"Latency: {latency:.3f}") diff --git a/benchmark/generative_agents/bench_sglang.py b/benchmark/generative_agents/bench_sglang.py index b42a32b44..034b16591 100644 --- a/benchmark/generative_agents/bench_sglang.py +++ b/benchmark/generative_agents/bench_sglang.py @@ -35,14 +35,14 @@ def main(args): states = [] # Run requests - tic = time.time() + tic = time.perf_counter() for a in arguments: # only a single key in the dict for func, arg in a.items(): result = func.run(**arg) result.sync() states.append(result) - latency = time.time() - tic + latency = time.perf_counter() - tic # Compute accuracy print(f"Latency: {latency:.3f}") diff --git a/benchmark/gsm8k/bench_other.py b/benchmark/gsm8k/bench_other.py index a8bbcfb5c..6dcb9ad7c 100644 --- a/benchmark/gsm8k/bench_other.py +++ b/benchmark/gsm8k/bench_other.py @@ -75,7 +75,7 @@ def main(args): ) states[i] = answer - tic = time.time() + tic = time.perf_counter() if args.parallel == 1: for i in tqdm(range(len(questions))): get_one_answer(i) @@ -106,9 +106,9 @@ def main(args): for j in range(len(rets)): states[i + j] = rets[j] - tic = time.time() + tic = time.perf_counter() asyncio.run(batched_call(batch_size=args.parallel)) - latency = time.time() - tic + latency = time.perf_counter() - tic preds = [] for i in range(len(states)): diff --git a/benchmark/gsm8k/bench_sglang.py b/benchmark/gsm8k/bench_sglang.py index b6bdbef09..05ac0beb1 100644 --- a/benchmark/gsm8k/bench_sglang.py +++ b/benchmark/gsm8k/bench_sglang.py @@ -84,14 +84,14 @@ def main(args): ##################################### # Run requests - tic = time.time() + tic = time.perf_counter() states = few_shot_gsm8k.run_batch( arguments, temperature=0, num_threads=args.parallel, progress_bar=True, ) - latency = time.time() - tic + latency = time.perf_counter() - tic preds = [] for i in range(len(states)): diff --git a/benchmark/hellaswag/bench_other.py b/benchmark/hellaswag/bench_other.py index 04be4569a..cde0794bb 100644 --- a/benchmark/hellaswag/bench_other.py +++ b/benchmark/hellaswag/bench_other.py @@ -57,7 +57,7 @@ def main(args): context=few_shot_examples + questions[i], choices=choices[i] ) - tic = time.time() + tic = time.perf_counter() if args.parallel == 1: for i in tqdm(range(len(questions))): get_one_answer(i) @@ -82,10 +82,10 @@ def main(args): for j in range(len(rets)): preds[i + j] = rets[j] - tic = time.time() + tic = time.perf_counter() asyncio.run(batched_call(batch_size=args.parallel)) - latency = time.time() - tic + latency = time.perf_counter() - tic # Compute accuracy acc = np.mean(np.array(preds) == np.array(labels)) diff --git a/benchmark/hellaswag/bench_sglang.py b/benchmark/hellaswag/bench_sglang.py index 798521f97..6345a453b 100644 --- a/benchmark/hellaswag/bench_sglang.py +++ b/benchmark/hellaswag/bench_sglang.py @@ -68,7 +68,7 @@ def main(args): ##################################### # Run requests - tic = time.time() + tic = time.perf_counter() rets = few_shot_hellaswag.run_batch( arguments, temperature=0, @@ -76,7 +76,7 @@ def main(args): progress_bar=True, ) preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))] - latency = time.time() - tic + latency = time.perf_counter() - tic # Compute accuracy acc = np.mean(np.array(preds) == np.array(labels)) diff --git a/benchmark/hicache/bench_multiturn.py b/benchmark/hicache/bench_multiturn.py index 6bd0bd99e..a2a88b634 100644 --- a/benchmark/hicache/bench_multiturn.py +++ b/benchmark/hicache/bench_multiturn.py @@ -261,7 +261,7 @@ class WorkloadGenerator: client_id, payload = item response = await async_request_sglang_generate(payload, self.url, self.pbar) if self.pbar.n == self.pbar.total: - self.finished_time = time.time() + self.finished_time = time.perf_counter() self.response_queue.put((client_id, response)) except Exception as e: print(f"Request failed: {e}") @@ -334,7 +334,7 @@ class WorkloadGenerator: request_thread = threading.Thread(target=self.request_sender, daemon=True) response_thread = threading.Thread(target=self.response_handler, daemon=True) - self.start_time = time.time() + self.start_time = time.perf_counter() request_thread.start() response_thread.start() diff --git a/benchmark/json_decode_regex/bench_other.py b/benchmark/json_decode_regex/bench_other.py index d80ea1de7..87051ea82 100644 --- a/benchmark/json_decode_regex/bench_other.py +++ b/benchmark/json_decode_regex/bench_other.py @@ -53,7 +53,7 @@ def main(args): def get_one_answer(i): states[i] = json_decode(generate=call_generate, **arguments[i]) - tic = time.time() + tic = time.perf_counter() if args.parallel == 1: for i in tqdm(range(len(arguments))): get_one_answer(i) @@ -68,7 +68,7 @@ def main(args): for _ in rets: pass - latency = time.time() - tic + latency = time.perf_counter() - tic # Compute accuracy print(f"Latency: {latency:.3f}") diff --git a/benchmark/json_decode_regex/bench_sglang.py b/benchmark/json_decode_regex/bench_sglang.py index 4139ebf8a..9aab11e43 100644 --- a/benchmark/json_decode_regex/bench_sglang.py +++ b/benchmark/json_decode_regex/bench_sglang.py @@ -63,11 +63,11 @@ def main(args): json_warm_up.run().sync() # Run requests - tic = time.time() + tic = time.perf_counter() states = json_decode.run_batch( arguments, temperature=0, num_threads=args.parallel, progress_bar=True ) - latency = time.time() - tic + latency = time.perf_counter() - tic # Compute accuracy print(f"Latency: {latency:.3f}") diff --git a/benchmark/json_jump_forward/bench_other.py b/benchmark/json_jump_forward/bench_other.py index 9eb5c58b3..a64e950d7 100644 --- a/benchmark/json_jump_forward/bench_other.py +++ b/benchmark/json_jump_forward/bench_other.py @@ -175,7 +175,7 @@ def bench_character(args): else: raise ValueError(f"Invalid backend: {args.backend}") - tic = time.time() + tic = time.perf_counter() if args.backend != "lmql": if args.parallel == 1: @@ -202,7 +202,7 @@ def bench_character(args): asyncio.gather(*[get_one_answer_async(i) for i in bt]) ) - latency = time.time() - tic + latency = time.perf_counter() - tic return states, latency @@ -236,7 +236,7 @@ def bench_city_doc(args): else: raise ValueError(f"Invalid backend: {args.backend}") - tic = time.time() + tic = time.perf_counter() if args.parallel == 1: for i in tqdm(range(len(arguments))): get_one_answer(i) @@ -246,7 +246,7 @@ def bench_city_doc(args): for _ in rets: pass - latency = time.time() - tic + latency = time.perf_counter() - tic return states, latency diff --git a/benchmark/json_jump_forward/bench_sglang.py b/benchmark/json_jump_forward/bench_sglang.py index 10cf2699b..29f635f75 100644 --- a/benchmark/json_jump_forward/bench_sglang.py +++ b/benchmark/json_jump_forward/bench_sglang.py @@ -67,14 +67,14 @@ def bench_city_doc(args): sgl.set_default_backend(backend) # Run requests - tic = time.time() + tic = time.perf_counter() states = city_gen.run_batch( arguments, temperature=0, num_threads=args.parallel, progress_bar=True, ) - latency = time.time() - tic + latency = time.perf_counter() - tic return states, latency @@ -91,14 +91,14 @@ def bench_character(args): sgl.set_default_backend(backend) # Run requests - tic = time.time() + tic = time.perf_counter() states = character_gen.run_batch( arguments, temperature=0, num_threads=args.parallel, progress_bar=True, ) - latency = time.time() - tic + latency = time.perf_counter() - tic return states, latency diff --git a/benchmark/json_schema/bench_sglang.py b/benchmark/json_schema/bench_sglang.py index 4693baae3..55365ff2e 100644 --- a/benchmark/json_schema/bench_sglang.py +++ b/benchmark/json_schema/bench_sglang.py @@ -85,14 +85,14 @@ def bench_schema(args): sgl.set_default_backend(backend) # Run requests - tic = time.time() + tic = time.perf_counter() states = schema_gen.run_batch( arguments, temperature=0, num_threads=args.parallel, progress_bar=True, ) - latency = time.time() - tic + latency = time.perf_counter() - tic # Check if the outputs are valid indexes = [] diff --git a/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py index a3ead1eca..be349e456 100644 --- a/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py +++ b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py @@ -487,7 +487,7 @@ def main(args: argparse.Namespace): ] print(f"Start tuning over {len(search_space)} configurations...") - start = time.time() + start = time.perf_counter() configs = _distribute( "tune", [ @@ -522,7 +522,7 @@ def main(args: argparse.Namespace): use_int8_w8a16, block_shape, ) - end = time.time() + end = time.perf_counter() print(f"Tuning took {end - start:.2f} seconds") else: outputs = _distribute( diff --git a/benchmark/kernels/quantization/tuning_block_wise_kernel.py b/benchmark/kernels/quantization/tuning_block_wise_kernel.py index 7b0dfb47a..1b51e54b7 100644 --- a/benchmark/kernels/quantization/tuning_block_wise_kernel.py +++ b/benchmark/kernels/quantization/tuning_block_wise_kernel.py @@ -359,7 +359,7 @@ def tune_on_gpu(args_dict): config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0 ] - start = time.time() + start = time.perf_counter() results = {} for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"): N, K = shape[0], shape[1] @@ -379,7 +379,7 @@ def tune_on_gpu(args_dict): best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)} save_configs(N, K, block_n, block_k, best_configs, save_path, input_type) - end = time.time() + end = time.perf_counter() print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds") diff --git a/benchmark/line_retrieval/bench_sglang.py b/benchmark/line_retrieval/bench_sglang.py index 922d5009d..e974e7dd3 100644 --- a/benchmark/line_retrieval/bench_sglang.py +++ b/benchmark/line_retrieval/bench_sglang.py @@ -70,7 +70,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents): # Select backend backend = select_sglang_backend(args) - tic = time.time() + tic = time.perf_counter() states = line_retrieval.run_batch( arguments, temperature=0, @@ -78,7 +78,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents): num_threads=args.parallel, progress_bar=True, ) - latency = time.time() - tic + latency = time.perf_counter() - tic corrects = [] for i in range(len(arguments)): diff --git a/benchmark/llava_bench/bench_sglang.py b/benchmark/llava_bench/bench_sglang.py index f84c8a90f..b9e8c1405 100644 --- a/benchmark/llava_bench/bench_sglang.py +++ b/benchmark/llava_bench/bench_sglang.py @@ -41,7 +41,7 @@ def main(args): sgl.set_default_backend(backend) # Run requests - tic = time.time() + tic = time.perf_counter() if args.parallel == 1: for i in tqdm.tqdm(range(len(lines))): image_file = arguments[i]["image_file"] @@ -52,7 +52,7 @@ def main(args): states = image_qa.run_batch( arguments, temperature=0, num_threads=args.parallel, progress_bar=True ) - latency = time.time() - tic + latency = time.perf_counter() - tic print(f"Latency: {latency:.3f}") diff --git a/benchmark/llm_judge/bench_other.py b/benchmark/llm_judge/bench_other.py index 2231bcdbb..8e6029067 100644 --- a/benchmark/llm_judge/bench_other.py +++ b/benchmark/llm_judge/bench_other.py @@ -85,7 +85,7 @@ def main(args): call_generate = partial(get_call_generate(args), temperature=0) # Run requests - tic = time.time() + tic = time.perf_counter() if args.backend != "lmql": @@ -120,7 +120,7 @@ def main(args): asyncio.gather(*[get_one_answer_async(i) for i in bt]) ) - latency = time.time() - tic + latency = time.perf_counter() - tic # Compute accuracy print(f"Latency: {latency:.3f}") diff --git a/benchmark/llm_judge/bench_sglang.py b/benchmark/llm_judge/bench_sglang.py index 38c95974e..97e6c3979 100644 --- a/benchmark/llm_judge/bench_sglang.py +++ b/benchmark/llm_judge/bench_sglang.py @@ -59,7 +59,7 @@ def main(args): backend = select_sglang_backend(args) # Run requests - tic = time.time() + tic = time.perf_counter() states = multi_dimension_judge.run_batch( arguments, temperature=0, @@ -67,7 +67,7 @@ def main(args): num_threads=args.parallel, progress_bar=True, ) - latency = time.time() - tic + latency = time.perf_counter() - tic print(f"Latency: {latency:.3f}") diff --git a/benchmark/long_json_decode/bench_other.py b/benchmark/long_json_decode/bench_other.py index a83c797c4..0ad38a014 100644 --- a/benchmark/long_json_decode/bench_other.py +++ b/benchmark/long_json_decode/bench_other.py @@ -45,7 +45,7 @@ def main(args): def get_one_answer(i): states[i] = json_decode(generate=call_generate, **arguments[i]) - tic = time.time() + tic = time.perf_counter() if args.parallel == 1: for i in tqdm(range(len(arguments))): get_one_answer(i) @@ -58,7 +58,7 @@ def main(args): ) ) - latency = time.time() - tic + latency = time.perf_counter() - tic # Compute accuracy print(f"Latency: {latency:.3f}") diff --git a/benchmark/long_json_decode/bench_sglang.py b/benchmark/long_json_decode/bench_sglang.py index 6e19a732f..8394cfc2e 100644 --- a/benchmark/long_json_decode/bench_sglang.py +++ b/benchmark/long_json_decode/bench_sglang.py @@ -46,11 +46,11 @@ def main(args): sgl.set_default_backend(backend) # Run requests - tic = time.time() + tic = time.perf_counter() states = json_decode.run_batch( arguments, temperature=0, num_threads=args.parallel, progress_bar=True ) - latency = time.time() - tic + latency = time.perf_counter() - tic # Compute accuracy print(f"Latency: {latency:.3f}") diff --git a/benchmark/mmlu/bench_other.py b/benchmark/mmlu/bench_other.py index c5d48dac6..f1b166c2b 100644 --- a/benchmark/mmlu/bench_other.py +++ b/benchmark/mmlu/bench_other.py @@ -76,7 +76,7 @@ def evaluate(args, subject, dev_df, test_df, call_generate): pred = call_generate(prompts[i], temperature=0, max_tokens=max_tokens) preds[i] = pred.strip()[0] - tic = time.time() + tic = time.perf_counter() if args.parallel == 1: for i in range(len(prompts)): get_one_answer(i) @@ -94,9 +94,9 @@ def evaluate(args, subject, dev_df, test_df, call_generate): for j in range(len(rets)): preds[i + j] = rets[j].strip()[0] - tic = time.time() + tic = time.perf_counter() asyncio.run(batched_call(batch_size=args.parallel)) - latency = time.time() - tic + latency = time.perf_counter() - tic # Compute accuracy cors = [pred == label for pred, label in zip(preds, labels)] diff --git a/benchmark/mmlu/bench_sglang.py b/benchmark/mmlu/bench_sglang.py index 210b6111e..0bae7b6e4 100644 --- a/benchmark/mmlu/bench_sglang.py +++ b/benchmark/mmlu/bench_sglang.py @@ -116,7 +116,7 @@ def main(args): backend = select_sglang_backend(args) # Run - tic = time.time() + tic = time.perf_counter() states = few_shot_mmlu.run_batch( arguments, temperature=0, @@ -128,7 +128,7 @@ def main(args): preds = [ s["answer"].strip()[0] if len(s["answer"].strip()) > 0 else "" for s in states ] - latency = time.time() - tic + latency = time.perf_counter() - tic # Compute accuracy cors = [pred == label for pred, label in zip(preds, labels)] diff --git a/benchmark/mmmu/bench_sglang.py b/benchmark/mmmu/bench_sglang.py index 58a4039ef..a177fd137 100644 --- a/benchmark/mmmu/bench_sglang.py +++ b/benchmark/mmmu/bench_sglang.py @@ -119,7 +119,7 @@ async def eval_mmmu(args) -> None: api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1" ) semaphore = asyncio.Semaphore(args.concurrency) - start = time.time() + start = time.perf_counter() base_url = f"http://127.0.0.1:{args.port}" if args.profile: @@ -147,7 +147,7 @@ async def eval_mmmu(args) -> None: if profile_output.success: print("Profiler stopped") - print(f"Benchmark time: {time.time() - start}") + print(f"Benchmark time: {time.perf_counter() - start}") args.output_path = f"./val_sglang.json" save_json(args.output_path, out_samples) eval_result(model_answer_path=args.output_path, answer_dict=answer_dict) diff --git a/benchmark/mtbench/bench_other.py b/benchmark/mtbench/bench_other.py index 2c321e8a1..5e579e9a6 100644 --- a/benchmark/mtbench/bench_other.py +++ b/benchmark/mtbench/bench_other.py @@ -66,7 +66,7 @@ def main(args): answers[i] = cur_answers # Run requests - tic = time.time() + tic = time.perf_counter() if args.parallel == 1: for i in tqdm(range(len(questions))): get_answer(i) @@ -79,7 +79,7 @@ def main(args): ) ) - latency = time.time() - tic + latency = time.perf_counter() - tic print(f"#questions: {len(questions)}, Latency: {latency:.2f}") diff --git a/benchmark/mtbench/bench_sglang.py b/benchmark/mtbench/bench_sglang.py index b57d1647d..0d0545b3a 100644 --- a/benchmark/mtbench/bench_sglang.py +++ b/benchmark/mtbench/bench_sglang.py @@ -57,7 +57,7 @@ def main(args): sgl.set_default_backend(backend) # Run requests - tic = time.time() + tic = time.perf_counter() rets = answer_mt_bench.run_batch( arguments, temperature=0, @@ -66,7 +66,7 @@ def main(args): progress_bar=True, ) answers = [[s["answer_1"], s["answer_2"]] for s in rets] - latency = time.time() - tic + latency = time.perf_counter() - tic print(f"#questions: {len(questions)}, Latency: {latency:.2f}") diff --git a/benchmark/mtbench/bench_sglang_eagle.py b/benchmark/mtbench/bench_sglang_eagle.py index e1207afe1..3eb6036c7 100644 --- a/benchmark/mtbench/bench_sglang_eagle.py +++ b/benchmark/mtbench/bench_sglang_eagle.py @@ -68,7 +68,7 @@ def main(args): sgl.set_default_backend(backend) # Run requests - tic = time.time() + tic = time.perf_counter() rets = answer_mt_bench.run_batch( arguments, temperature=0, @@ -78,7 +78,7 @@ def main(args): ) answers = [[s["answer_1"], s["answer_2"]] for s in rets] - latency = time.time() - tic + latency = time.perf_counter() - tic num_output_tokens = sum( s.get_meta_info("answer_1")["completion_tokens"] + s.get_meta_info("answer_2")["completion_tokens"] diff --git a/benchmark/multi_chain_reasoning/bench_other.py b/benchmark/multi_chain_reasoning/bench_other.py index e0ff2be45..f361496ad 100644 --- a/benchmark/multi_chain_reasoning/bench_other.py +++ b/benchmark/multi_chain_reasoning/bench_other.py @@ -113,7 +113,7 @@ def main(args): answer = multi_chain_gsm8k(questions[i], args.num_chains, call_generate) states[i] = answer - tic = time.time() + tic = time.perf_counter() if args.parallel == 1: for i in tqdm(range(len(questions))): get_one_answer(i) @@ -134,7 +134,7 @@ def main(args): ) states[i] = answer - tic = time.time() + tic = time.perf_counter() loop = asyncio.get_event_loop() batches = [ list(range(i, min(i + args.parallel, len(questions)))) @@ -144,7 +144,7 @@ def main(args): tasks = [get_one_answer_asyncio(k) for k in bt] loop.run_until_complete(asyncio.gather(*tasks)) - latency = time.time() - tic + latency = time.perf_counter() - tic preds = [] for i in range(len(states)): diff --git a/benchmark/multi_chain_reasoning/bench_sglang.py b/benchmark/multi_chain_reasoning/bench_sglang.py index 98a6b511e..1d3129db2 100644 --- a/benchmark/multi_chain_reasoning/bench_sglang.py +++ b/benchmark/multi_chain_reasoning/bench_sglang.py @@ -90,7 +90,7 @@ def main(args): backend = select_sglang_backend(args) # Run requests - tic = time.time() + tic = time.perf_counter() states = multi_chain_gsm8k.run_batch( arguments, temperature=0, @@ -98,7 +98,7 @@ def main(args): num_threads=args.parallel, progress_bar=True, ) - latency = time.time() - tic + latency = time.perf_counter() - tic preds = [] for i in range(len(states)): diff --git a/benchmark/multi_document_qa/bench_other.py b/benchmark/multi_document_qa/bench_other.py index 6f0addcb7..627837c5c 100644 --- a/benchmark/multi_document_qa/bench_other.py +++ b/benchmark/multi_document_qa/bench_other.py @@ -61,7 +61,7 @@ def main(args): def get_one_answer(i): states[i] = multi_document_qa(generate=call_generate, **arguments[i]) - tic = time.time() + tic = time.perf_counter() if args.parallel == 1: for i in tqdm(range(len(labels))): get_one_answer(i) @@ -74,7 +74,7 @@ def main(args): ) ) - latency = time.time() - tic + latency = time.perf_counter() - tic # Compute accuracy print(states) diff --git a/benchmark/multi_document_qa/bench_sglang.py b/benchmark/multi_document_qa/bench_sglang.py index 645520166..0b4b0dbc6 100644 --- a/benchmark/multi_document_qa/bench_sglang.py +++ b/benchmark/multi_document_qa/bench_sglang.py @@ -49,11 +49,11 @@ def main(args): sgl.set_default_backend(backend) # Run requests - tic = time.time() + tic = time.perf_counter() states = multi_document_qa.run_batch( arguments, temperature=0, num_threads=args.parallel, progress_bar=True ) - latency = time.time() - tic + latency = time.perf_counter() - tic # Compute accuracy print([s["answer"] for s in states]) diff --git a/benchmark/multi_turn_chat/bench_other.py b/benchmark/multi_turn_chat/bench_other.py index 81d67ab7b..9189af5be 100644 --- a/benchmark/multi_turn_chat/bench_other.py +++ b/benchmark/multi_turn_chat/bench_other.py @@ -35,7 +35,7 @@ def main(args): def get_one_answer(i): states[i] = multi_turns(generate=call_generate, **multi_qas[i]) - tic = time.time() + tic = time.perf_counter() if args.parallel == 1: for i in tqdm(range(len(multi_qas))): get_one_answer(i) @@ -50,7 +50,7 @@ def main(args): for _ in rets: pass - latency = time.time() - tic + latency = time.perf_counter() - tic # Compute accuracy print(f"Latency: {latency:.3f}") diff --git a/benchmark/multi_turn_chat/bench_sglang.py b/benchmark/multi_turn_chat/bench_sglang.py index 7feaced73..1051bf19e 100644 --- a/benchmark/multi_turn_chat/bench_sglang.py +++ b/benchmark/multi_turn_chat/bench_sglang.py @@ -27,7 +27,7 @@ def main(args): backend = select_sglang_backend(args) - tic = time.time() + tic = time.perf_counter() states = multi_turns.run_batch( multi_qas, temperature=0, @@ -35,7 +35,7 @@ def main(args): num_threads=args.parallel, progress_bar=True, ) - latency = time.time() - tic + latency = time.perf_counter() - tic print(f"Latency: {latency:.3f}") diff --git a/benchmark/multi_turn_chat/long_prompt_multi_turn.py b/benchmark/multi_turn_chat/long_prompt_multi_turn.py index 20f6dd5e3..bda5bb9cc 100644 --- a/benchmark/multi_turn_chat/long_prompt_multi_turn.py +++ b/benchmark/multi_turn_chat/long_prompt_multi_turn.py @@ -84,7 +84,7 @@ def main(args): backend = select_sglang_backend(args) - tic = time.time() + tic = time.perf_counter() states = multi_turns.run_batch( multi_qas, temperature=0, @@ -92,7 +92,7 @@ def main(args): num_threads="auto", progress_bar=True, ) - latency = time.time() - tic + latency = time.perf_counter() - tic print(f"Latency: {latency:.3f}") diff --git a/benchmark/react/bench_other.py b/benchmark/react/bench_other.py index 91c5546f1..08666662b 100644 --- a/benchmark/react/bench_other.py +++ b/benchmark/react/bench_other.py @@ -146,7 +146,7 @@ def main(args): states.append(answer) - tic = time.time() + tic = time.perf_counter() if args.backend != "lmql": if args.parallel == 1: @@ -173,7 +173,7 @@ def main(args): tasks = [run_single_agent_async(arg) for arg in bt] loop.run_until_complete(asyncio.gather(*tasks)) - latency = time.time() - tic + latency = time.perf_counter() - tic print(f"Latency: {latency:.3f}") diff --git a/benchmark/react/bench_sglang.py b/benchmark/react/bench_sglang.py index b07105e2c..331638e9f 100644 --- a/benchmark/react/bench_sglang.py +++ b/benchmark/react/bench_sglang.py @@ -115,14 +115,14 @@ def main(args): sgl.set_default_backend(backend) states = [] - tic = time.time() + tic = time.perf_counter() states = webthink.run_batch( arguments, temperature=0, num_threads=args.parallel, progress_bar=True, ) - latency = time.time() - tic + latency = time.perf_counter() - tic # Compute accuracy print(f"Latency: {latency:.3f}") diff --git a/benchmark/reasoning_benchmark/bench_sglang.py b/benchmark/reasoning_benchmark/bench_sglang.py index c83204960..ccbff9d17 100644 --- a/benchmark/reasoning_benchmark/bench_sglang.py +++ b/benchmark/reasoning_benchmark/bench_sglang.py @@ -51,7 +51,7 @@ def main(args): ) # Run requests - tic = time.time() + tic = time.perf_counter() states = reasoning_gen.run_batch( questions, num_threads=args.parallel, @@ -60,7 +60,7 @@ def main(args): max_new_tokens=32768, top_p=0.95, ) - latency = time.time() - tic + latency = time.perf_counter() - tic # Extract results and record outcomes in a list. outcomes = [] diff --git a/benchmark/tip_suggestion/bench_other.py b/benchmark/tip_suggestion/bench_other.py index fcc4fd624..2630081bd 100644 --- a/benchmark/tip_suggestion/bench_other.py +++ b/benchmark/tip_suggestion/bench_other.py @@ -68,7 +68,7 @@ def main(args): call_generate = partial(get_call_generate(args), temperature=0) # Run requests - tic = time.time() + tic = time.perf_counter() if args.backend != "lmql": def get_one_answer(i): @@ -102,7 +102,7 @@ def main(args): loop.run_until_complete( asyncio.gather(*[get_one_answer_async(i) for i in batch]) ) - latency = time.time() - tic + latency = time.perf_counter() - tic # Compute accuracy print(f"Latency: {latency:.3f}") diff --git a/benchmark/tip_suggestion/bench_sglang.py b/benchmark/tip_suggestion/bench_sglang.py index 6d17821bc..86c476f97 100644 --- a/benchmark/tip_suggestion/bench_sglang.py +++ b/benchmark/tip_suggestion/bench_sglang.py @@ -65,11 +65,11 @@ def main(args): sgl.set_default_backend(select_sglang_backend(args)) # Run requests - tic = time.time() + tic = time.perf_counter() states = suggest_tips.run_batch( arguments, temperature=0, num_threads=args.parallel, progress_bar=True ) - latency = time.time() - tic + latency = time.perf_counter() - tic # Compute accuracy print(f"Latency: {latency:.3f}") diff --git a/benchmark/tree_of_thought_deep/bench_other.py b/benchmark/tree_of_thought_deep/bench_other.py index 21c7df351..0ef8c6360 100644 --- a/benchmark/tree_of_thought_deep/bench_other.py +++ b/benchmark/tree_of_thought_deep/bench_other.py @@ -138,7 +138,7 @@ def main(args): # Run requests states = [None] * len(questions) - tic = time.time() + tic = time.perf_counter() if args.backend != "lmql": def get_one_answer(i): @@ -177,7 +177,7 @@ def main(args): tasks = [get_one_answer_async(k) for k in bt] loop.run_until_complete(asyncio.gather(*tasks)) - latency = time.time() - tic + latency = time.perf_counter() - tic answers_text = [] for s in states: diff --git a/benchmark/tree_of_thought_deep/bench_sglang.py b/benchmark/tree_of_thought_deep/bench_sglang.py index bfb2a4113..bcdb6e54d 100644 --- a/benchmark/tree_of_thought_deep/bench_sglang.py +++ b/benchmark/tree_of_thought_deep/bench_sglang.py @@ -119,7 +119,7 @@ def main(args): backend = select_sglang_backend(args) # Run requests - tic = time.time() + tic = time.perf_counter() states = tree_search.run_batch( arguments, temperature=0, @@ -127,7 +127,7 @@ def main(args): num_threads=args.parallel, progress_bar=True, ) - latency = time.time() - tic + latency = time.perf_counter() - tic answers_text = [] for s in states: answers_text.append([x for xs in s.ret_value for x in xs]) diff --git a/benchmark/tree_of_thought_v0/bench_other.py b/benchmark/tree_of_thought_v0/bench_other.py index 86e133577..703ecd7f4 100644 --- a/benchmark/tree_of_thought_v0/bench_other.py +++ b/benchmark/tree_of_thought_v0/bench_other.py @@ -121,7 +121,7 @@ def main(args): def get_one_answer(i): states[i] = tree_search(**arguments[i], call_generate=call_generate) - tic = time.time() + tic = time.perf_counter() if args.parallel == 1: for i in tqdm(range(len(questions))): get_one_answer(i) @@ -134,7 +134,7 @@ def main(args): ) ) - latency = time.time() - tic + latency = time.perf_counter() - tic answers_text = [] for s in states: diff --git a/benchmark/tree_of_thought_v0/bench_sglang.py b/benchmark/tree_of_thought_v0/bench_sglang.py index f0d130778..6d7575f36 100644 --- a/benchmark/tree_of_thought_v0/bench_sglang.py +++ b/benchmark/tree_of_thought_v0/bench_sglang.py @@ -107,7 +107,7 @@ def main(args): backend = select_sglang_backend(args) # Run requests - tic = time.time() + tic = time.perf_counter() states = tree_search.run_batch( arguments, temperature=0, @@ -115,7 +115,7 @@ def main(args): num_threads=args.parallel, progress_bar=True, ) - latency = time.time() - tic + latency = time.perf_counter() - tic answers_text = [] for s in states: answers_text.append([x for xs in s["answer"] for x in xs]) diff --git a/python/sglang/test/few_shot_gsm8k.py b/python/sglang/test/few_shot_gsm8k.py index 4f655eb60..5aac87bd2 100644 --- a/python/sglang/test/few_shot_gsm8k.py +++ b/python/sglang/test/few_shot_gsm8k.py @@ -90,7 +90,7 @@ def run_eval(args): ##################################### # Run requests - tic = time.time() + tic = time.perf_counter() states = few_shot_gsm8k.run_batch( arguments, temperature=args.temperature if hasattr(args, "temperature") else 0, @@ -99,7 +99,7 @@ def run_eval(args): return_logprob=getattr(args, "return_logprob", None), logprob_start_len=getattr(args, "logprob_start_len", None), ) - latency = time.time() - tic + latency = time.perf_counter() - tic preds = [] for i in range(len(states)): diff --git a/python/sglang/test/few_shot_gsm8k_engine.py b/python/sglang/test/few_shot_gsm8k_engine.py index 67844e2f1..2453a91e4 100644 --- a/python/sglang/test/few_shot_gsm8k_engine.py +++ b/python/sglang/test/few_shot_gsm8k_engine.py @@ -89,7 +89,7 @@ def run_eval(args): } # Run requests - tic = time.time() + tic = time.perf_counter() loop = asyncio.get_event_loop() @@ -98,7 +98,7 @@ def run_eval(args): ) # End requests - latency = time.time() - tic + latency = time.perf_counter() - tic # Shutdown the engine engine.shutdown() diff --git a/python/sglang/test/run_eval.py b/python/sglang/test/run_eval.py index fe88171ce..51743be09 100644 --- a/python/sglang/test/run_eval.py +++ b/python/sglang/test/run_eval.py @@ -71,9 +71,9 @@ def run_eval(args): ) # Run eval - tic = time.time() + tic = time.perf_counter() result = eval_obj(sampler) - latency = time.time() - tic + latency = time.perf_counter() - tic # Dump reports metrics = result.metrics | {"score": result.score} diff --git a/python/sglang/test/test_programs.py b/python/sglang/test/test_programs.py index 262637eed..6756f2dd7 100644 --- a/python/sglang/test/test_programs.py +++ b/python/sglang/test/test_programs.py @@ -503,7 +503,7 @@ def test_hellaswag_select(): ##################################### # Run requests - tic = time.time() + tic = time.perf_counter() rets = few_shot_hellaswag.run_batch( arguments, temperature=0, @@ -514,13 +514,13 @@ def test_hellaswag_select(): preds = [] for i, ret in enumerate(rets): preds.append(choices[i].index(ret["answer"])) - latency = time.time() - tic + latency = time.perf_counter() - tic # Compute accuracy accuracy = np.mean(np.array(preds) == np.array(labels)) # Test generator style of run_batch - tic = time.time() + tic = time.perf_counter() rets = few_shot_hellaswag.run_batch( arguments, temperature=0, @@ -531,7 +531,7 @@ def test_hellaswag_select(): preds_gen = [] for i, ret in enumerate(rets): preds_gen.append(choices[i].index(ret["answer"])) - latency_gen = time.time() - tic + latency_gen = time.perf_counter() - tic # Compute accuracy accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels)) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 1e78d6dc1..150f385c9 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -449,9 +449,9 @@ def popen_launch_server( else: process = subprocess.Popen(command, stdout=None, stderr=None, env=env) - start_time = time.time() + start_time = time.perf_counter() with requests.Session() as session: - while time.time() - start_time < timeout: + while time.perf_counter() - start_time < timeout: try: headers = { "Content-Type": "application/json; charset=utf-8", @@ -584,7 +584,7 @@ class TestFile: def run_unittest_files(files: List[TestFile], timeout_per_file: float): - tic = time.time() + tic = time.perf_counter() success = True for i, file in enumerate(files): @@ -599,13 +599,13 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float): f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n", flush=True, ) - tic = time.time() + tic = time.perf_counter() process = subprocess.Popen( ["python3", filename], stdout=None, stderr=None, env=os.environ ) process.wait() - elapsed = time.time() - tic + elapsed = time.perf_counter() - tic print( f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n", @@ -631,9 +631,9 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float): break if success: - print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True) + print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True) else: - print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True) + print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True) return 0 if success else -1 diff --git a/sgl-router/py_test/test_launch_server.py b/sgl-router/py_test/test_launch_server.py index 33dd3e854..afffe334f 100644 --- a/sgl-router/py_test/test_launch_server.py +++ b/sgl-router/py_test/test_launch_server.py @@ -92,9 +92,9 @@ def popen_launch_router( process = subprocess.Popen(command, stdout=None, stderr=None) - start_time = time.time() + start_time = time.perf_counter() with requests.Session() as session: - while time.time() - start_time < timeout: + while time.perf_counter() - start_time < timeout: try: response = session.get(f"{base_url}/health") if response.status_code == 200: @@ -155,11 +155,11 @@ def terminate_and_wait(process, timeout=300): return process.terminate() - start_time = time.time() + start_time = time.perf_counter() while process.poll() is None: print(f"Terminating process {process.pid}") - if time.time() - start_time > timeout: + if time.perf_counter() - start_time > timeout: raise TimeoutError( f"Process {process.pid} failed to terminate within {timeout}s" ) diff --git a/test/srt/experiment_runner.py b/test/srt/experiment_runner.py index 7feeef1aa..f32f61d3b 100644 --- a/test/srt/experiment_runner.py +++ b/test/srt/experiment_runner.py @@ -184,9 +184,9 @@ class ExperimentRunner: self.logger = logging.getLogger(__name__) def wait_for_server(self, port: int, timeout: int = 300) -> bool: - start_time = time.time() + start_time = time.perf_counter() - while time.time() - start_time < timeout: + while time.perf_counter() - start_time < timeout: try: response = requests.get(f"http://localhost:{port}/health") if response.status_code == 200: @@ -197,7 +197,7 @@ class ExperimentRunner: return False def run_task(self, config: TaskConfig) -> TaskResult: - start_time = time.time() + start_time = time.perf_counter() client_output = [] try: @@ -247,7 +247,7 @@ class ExperimentRunner: name=config.name, success=True, output=formatted_output, - runtime=time.time() - start_time, + runtime=time.perf_counter() - start_time, timestamp=datetime.now().isoformat(), ) @@ -256,7 +256,7 @@ class ExperimentRunner: name=config.name, success=False, output=str(e), - runtime=time.time() - start_time, + runtime=time.perf_counter() - start_time, timestamp=datetime.now().isoformat(), ) diff --git a/test/srt/models/test_encoder_embedding_models.py b/test/srt/models/test_encoder_embedding_models.py index 5202917c4..bea5d4aff 100644 --- a/test/srt/models/test_encoder_embedding_models.py +++ b/test/srt/models/test_encoder_embedding_models.py @@ -79,9 +79,9 @@ class TestEncoderEmbeddingModels(CustomTestCase): # warm up hf_outputs = hf_runner.forward(truncated_prompts) - st_start_time = time.time() + st_start_time = time.perf_counter() hf_outputs = hf_runner.forward(truncated_prompts) - st_end_time = time.time() + st_end_time = time.perf_counter() with SRTRunner( model_path, @@ -95,9 +95,9 @@ class TestEncoderEmbeddingModels(CustomTestCase): # warm up srt_outputs = srt_runner.forward(truncated_prompts) - sgl_start_time = time.time() + sgl_start_time = time.perf_counter() srt_outputs = srt_runner.forward(truncated_prompts) - sgl_end_time = time.time() + sgl_end_time = time.perf_counter() transformer_time = st_end_time - st_start_time sgl_time = sgl_end_time - sgl_start_time diff --git a/test/srt/test_gptqmodel_dynamic.py b/test/srt/test_gptqmodel_dynamic.py index 27ccd9a4b..284465b8b 100644 --- a/test/srt/test_gptqmodel_dynamic.py +++ b/test/srt/test_gptqmodel_dynamic.py @@ -130,9 +130,9 @@ class TestGPTQModelDynamic(CustomTestCase): def test_throughput(self): max_tokens = 256 - tic = time.time() + tic = time.perf_counter() result = self.run_decode(max_tokens) - tok = time.time() + tok = time.perf_counter() print(f"result = `{result}`") @@ -185,9 +185,9 @@ class TestGPTQModelDynamicWithMarlin(CustomTestCase): def test_throughput(self): max_tokens = 256 - tic = time.time() + tic = time.perf_counter() result = self.run_decode(max_tokens) - tok = time.time() + tok = time.perf_counter() print(f"result = `{result}`") diff --git a/test/srt/test_release_memory_occupation.py b/test/srt/test_release_memory_occupation.py index 7ccd9f1f7..7a7659280 100644 --- a/test/srt/test_release_memory_occupation.py +++ b/test/srt/test_release_memory_occupation.py @@ -42,10 +42,10 @@ class TestReleaseMemoryOccupation(CustomTestCase): ) print("release_memory_occupation start") - t = time.time() + t = time.perf_counter() engine.release_memory_occupation() if _DEBUG_EXTRA: - print("release_memory_occupation", time.time() - t) + print("release_memory_occupation", time.perf_counter() - t) if _DEBUG_EXTRA: time.sleep(5) @@ -60,10 +60,10 @@ class TestReleaseMemoryOccupation(CustomTestCase): time.sleep(5) print("resume_memory_occupation start") - t = time.time() + t = time.perf_counter() engine.resume_memory_occupation() if _DEBUG_EXTRA: - print("resume_memory_occupation", time.time() - t) + print("resume_memory_occupation", time.perf_counter() - t) self.assertEqual( _try_allocate_big_tensor(), diff --git a/test/srt/test_torch_compile.py b/test/srt/test_torch_compile.py index 760cec84b..904e49f9d 100644 --- a/test/srt/test_torch_compile.py +++ b/test/srt/test_torch_compile.py @@ -62,9 +62,9 @@ class TestTorchCompile(CustomTestCase): res = self.run_decode(16) max_tokens = 256 - tic = time.time() + tic = time.perf_counter() res = self.run_decode(max_tokens) - tok = time.time() + tok = time.perf_counter() print(f"{res=}") throughput = max_tokens / (tok - tic) print(f"Throughput: {throughput} tokens/s") diff --git a/test/srt/test_torch_compile_moe.py b/test/srt/test_torch_compile_moe.py index 42415b155..63423af43 100644 --- a/test/srt/test_torch_compile_moe.py +++ b/test/srt/test_torch_compile_moe.py @@ -62,9 +62,9 @@ class TestTorchCompileMoe(CustomTestCase): res = self.run_decode(16) max_tokens = 256 - tic = time.time() + tic = time.perf_counter() res = self.run_decode(max_tokens) - tok = time.time() + tok = time.perf_counter() print(f"{res=}") throughput = max_tokens / (tok - tic) self.assertGreaterEqual(throughput, 285) diff --git a/test/srt/test_torchao.py b/test/srt/test_torchao.py index 77ec0a570..13c7b60b5 100644 --- a/test/srt/test_torchao.py +++ b/test/srt/test_torchao.py @@ -61,9 +61,9 @@ class TestTorchAO(CustomTestCase): max_tokens = 256 - tic = time.time() + tic = time.perf_counter() res = self.run_decode(max_tokens) - tok = time.time() + tok = time.perf_counter() print(res["text"]) throughput = max_tokens / (tok - tic) print(f"Throughput: {throughput} tokens/s") diff --git a/test/srt/test_update_weights_from_distributed.py b/test/srt/test_update_weights_from_distributed.py index e558a56e3..064406703 100644 --- a/test/srt/test_update_weights_from_distributed.py +++ b/test/srt/test_update_weights_from_distributed.py @@ -164,7 +164,7 @@ def init_process_hf( ) dist.barrier(group=group, device_ids=[rank]) torch.cuda.synchronize() - time_begin_broadcast = time.time() + time_begin_broadcast = time.perf_counter() # The last parameter is lm_head.weight, which is tied # with embed_tokens.weight. Actually, we only need @@ -182,7 +182,7 @@ def init_process_hf( group=group, ) torch.cuda.synchronize() - time_end_broadcast = time.time() + time_end_broadcast = time.perf_counter() # Measure the latency of broadcasting/weights update. broadcast_time = time_end_broadcast - time_begin_broadcast @@ -282,7 +282,7 @@ def init_process_sgl( ) torch.cuda.synchronize() - time_begin_update = time.time() + time_begin_update = time.perf_counter() # The last parameter is lm_head.weight, which is tied # with embed_tokens.weight. Actually, we only need @@ -312,7 +312,7 @@ def init_process_sgl( }, ) torch.cuda.synchronize() - time_end_update = time.time() + time_end_update = time.perf_counter() # Measure the latency of broadcast/weights update. update_time = time_end_update - time_begin_update diff --git a/test/srt/test_update_weights_from_tensor.py b/test/srt/test_update_weights_from_tensor.py index 1f3592447..38187652b 100644 --- a/test/srt/test_update_weights_from_tensor.py +++ b/test/srt/test_update_weights_from_tensor.py @@ -21,9 +21,9 @@ def test_update_weights_from_tensor(tp_size): memory_before = torch.cuda.memory_allocated() new_tensor = torch.full((16384, 2048), 1.5, device="cuda") - time_start = time.time() + time_start = time.perf_counter() engine.update_weights_from_tensor([(x, new_tensor) for x in param_names]) - print(f"Time delta: {time.time() - time_start:.03f}") + print(f"Time delta: {time.perf_counter() - time_start:.03f}") for param_name in param_names[:3]: _check_param(engine, param_name, [1.5] * 5) diff --git a/test/srt/test_w8a8_quantization.py b/test/srt/test_w8a8_quantization.py index 2cb2fa073..3d4ce1afa 100644 --- a/test/srt/test_w8a8_quantization.py +++ b/test/srt/test_w8a8_quantization.py @@ -62,9 +62,9 @@ class TestW8A8(CustomTestCase): def test_throughput(self): max_tokens = 256 - tic = time.time() + tic = time.perf_counter() res = self.run_decode(max_tokens) - tok = time.time() + tok = time.perf_counter() print(res["text"]) throughput = max_tokens / (tok - tic) print(f"Throughput: {throughput} tokens/s")