Replace time.time() with time.perf_counter() for benchmarking. (#6178)

Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
2025-05-11 14:32:49 -07:00
parent e9a47f4cb5
commit 6e2da51561
61 changed files with 158 additions and 158 deletions
--- a/benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py
+++ b/benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py
@@ -64,11 +64,11 @@ def test_batch_by_batch(all_prompts, gen_len):
    tot_time = 0
    for i in range(len(all_prompts)):
-        tic = time.time()
+        tic = time.perf_counter()
        text_qa.run_batch(
            list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))),
        )
-        tot_time += time.time() - tic
+        tot_time += time.perf_counter() - tic
    return tot_time
@@ -78,13 +78,13 @@ def test_batch_by_batch_with_hint(all_prompts, gen_len):
    tot_time = 0
    for i in range(len(all_prompts)):
-        tic = time.time()
+        tic = time.perf_counter()
        # Send a hint to cache the prefix
        text_qa.run_batch(list(zip(all_prompts[i][:1], [gen_len])))
        # Send the batch
        text_qa.run_batch(list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))))
-        tot_time += time.time() - tic
+        tot_time += time.perf_counter() - tic
    return tot_time
@@ -94,11 +94,11 @@ def test_send_all(all_prompts, gen_len):
    all_prompts = [x for prompt_list in all_prompts for x in prompt_list]
-    tic = time.time()
+    tic = time.perf_counter()
    text_qa.run_batch(
        list(zip(all_prompts, [gen_len] * len(all_prompts))),
    )
-    tot_time = time.time() - tic
+    tot_time = time.perf_counter() - tic
    return tot_time
--- a/benchmark/benchmark_batch/benchmark_batch.py
+++ b/benchmark/benchmark_batch/benchmark_batch.py
@@ -81,7 +81,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
    }
    data = {"text": prompts, "sampling_params": sampling_params}
-    start_time = time.time()
+    start_time = time.perf_counter()
    try:
        response = requests.post(
            endpoint.base_url + "/generate", json=data, timeout=3600
@@ -90,7 +90,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
            error = response.json()
            raise RuntimeError(f"Request {request_id} failed: {error}")
        result = response.json()
-        elapsed_time = (time.time() - start_time) * 1000  # Convert to ms
+        elapsed_time = (time.perf_counter() - start_time) * 1000  # Convert to ms
        avg_per_prompt = elapsed_time / len(prompts) if prompts else 0
        return request_id, elapsed_time, avg_per_prompt, True, len(prompts)
    except Exception as e:
@@ -104,7 +104,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
    num_requests = len(batched_prompts)
    # Record start time for total latency
-    benchmark_start_time = time.time()
+    benchmark_start_time = time.perf_counter()
    for i, batch_prompts in enumerate(batched_prompts):
        request_id = i + 1
@@ -119,7 +119,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
        results.append(result)
    # Calculate total latency
-    total_latency = (time.time() - benchmark_start_time) * 1000  # Convert to ms
+    total_latency = (time.perf_counter() - benchmark_start_time) * 1000  # Convert to ms
    return results, total_latency
--- a/benchmark/benchmark_batch/benchmark_tokenizer.py
+++ b/benchmark/benchmark_batch/benchmark_tokenizer.py
@@ -44,10 +44,10 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
    for run in range(NUM_RUNS):
        batch_prompts = prompts[:batch_size]  # Use same prompts for fair comparison
-        start_time = time.time()
+        start_time = time.perf_counter()
        for prompt in batch_prompts:
            tokens = tokenizer.encode(prompt)
-        sequential_time = (time.time() - start_time) * 1000
+        sequential_time = (time.perf_counter() - start_time) * 1000
        sequential_times.append(sequential_time)
    # Batch tokenization using tokenizer()
@@ -55,9 +55,9 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
    for run in range(NUM_RUNS):
        batch_prompts = prompts[:batch_size]  # Use same prompts for fair comparison
-        start_time = time.time()
+        start_time = time.perf_counter()
        tokens = tokenizer(batch_prompts)
-        batch_time = (time.time() - start_time) * 1000
+        batch_time = (time.perf_counter() - start_time) * 1000
        batch_times.append(batch_time)
    return {
--- a/benchmark/generative_agents/bench_other.py
+++ b/benchmark/generative_agents/bench_other.py
@@ -39,7 +39,7 @@ def main(args):
        answer = await call_generate(**arg, temperature=0)
        states.append(answer)
-    tic = time.time()
+    tic = time.perf_counter()
    # we always sequentially execute agent calls to maintain its dependency
    if args.backend != "lmql":
        for arg in tqdm(arguments):
@@ -50,7 +50,7 @@ def main(args):
        loop = asyncio.get_event_loop()
        for arg in tqdm(arguments):
            loop.run_until_complete(get_one_answer_async(arg))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    print(f"Latency: {latency:.3f}")
--- a/benchmark/generative_agents/bench_sglang.py
+++ b/benchmark/generative_agents/bench_sglang.py
@@ -35,14 +35,14 @@ def main(args):
    states = []
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    for a in arguments:
        # only a single key in the dict
        for func, arg in a.items():
            result = func.run(**arg)
        result.sync()
        states.append(result)
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    print(f"Latency: {latency:.3f}")
--- a/benchmark/gsm8k/bench_other.py
+++ b/benchmark/gsm8k/bench_other.py
@@ -75,7 +75,7 @@ def main(args):
            )
            states[i] = answer
-        tic = time.time()
+        tic = time.perf_counter()
        if args.parallel == 1:
            for i in tqdm(range(len(questions))):
                get_one_answer(i)
@@ -106,9 +106,9 @@ def main(args):
                for j in range(len(rets)):
                    states[i + j] = rets[j]
-        tic = time.time()
+        tic = time.perf_counter()
        asyncio.run(batched_call(batch_size=args.parallel))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    preds = []
    for i in range(len(states)):
--- a/benchmark/gsm8k/bench_sglang.py
+++ b/benchmark/gsm8k/bench_sglang.py
@@ -84,14 +84,14 @@ def main(args):
    #####################################
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    states = few_shot_gsm8k.run_batch(
        arguments,
        temperature=0,
        num_threads=args.parallel,
        progress_bar=True,
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    preds = []
    for i in range(len(states)):
--- a/benchmark/hellaswag/bench_other.py
+++ b/benchmark/hellaswag/bench_other.py
@@ -57,7 +57,7 @@ def main(args):
                context=few_shot_examples + questions[i], choices=choices[i]
            )
-        tic = time.time()
+        tic = time.perf_counter()
        if args.parallel == 1:
            for i in tqdm(range(len(questions))):
                get_one_answer(i)
@@ -82,10 +82,10 @@ def main(args):
                for j in range(len(rets)):
                    preds[i + j] = rets[j]
-        tic = time.time()
+        tic = time.perf_counter()
        asyncio.run(batched_call(batch_size=args.parallel))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    acc = np.mean(np.array(preds) == np.array(labels))
--- a/benchmark/hellaswag/bench_sglang.py
+++ b/benchmark/hellaswag/bench_sglang.py
@@ -68,7 +68,7 @@ def main(args):
    #####################################
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    rets = few_shot_hellaswag.run_batch(
        arguments,
        temperature=0,
@@ -76,7 +76,7 @@ def main(args):
        progress_bar=True,
    )
    preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    acc = np.mean(np.array(preds) == np.array(labels))
--- a/benchmark/hicache/bench_multiturn.py
+++ b/benchmark/hicache/bench_multiturn.py
@@ -261,7 +261,7 @@ class WorkloadGenerator:
            client_id, payload = item
            response = await async_request_sglang_generate(payload, self.url, self.pbar)
            if self.pbar.n == self.pbar.total:
-                self.finished_time = time.time()
+                self.finished_time = time.perf_counter()
            self.response_queue.put((client_id, response))
        except Exception as e:
            print(f"Request failed: {e}")
@@ -334,7 +334,7 @@ class WorkloadGenerator:
        request_thread = threading.Thread(target=self.request_sender, daemon=True)
        response_thread = threading.Thread(target=self.response_handler, daemon=True)
-        self.start_time = time.time()
+        self.start_time = time.perf_counter()
        request_thread.start()
        response_thread.start()
--- a/benchmark/json_decode_regex/bench_other.py
+++ b/benchmark/json_decode_regex/bench_other.py
@@ -53,7 +53,7 @@ def main(args):
    def get_one_answer(i):
        states[i] = json_decode(generate=call_generate, **arguments[i])
-    tic = time.time()
+    tic = time.perf_counter()
    if args.parallel == 1:
        for i in tqdm(range(len(arguments))):
            get_one_answer(i)
@@ -68,7 +68,7 @@ def main(args):
            for _ in rets:
                pass
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    print(f"Latency: {latency:.3f}")
--- a/benchmark/json_decode_regex/bench_sglang.py
+++ b/benchmark/json_decode_regex/bench_sglang.py
@@ -63,11 +63,11 @@ def main(args):
    json_warm_up.run().sync()
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    states = json_decode.run_batch(
        arguments, temperature=0, num_threads=args.parallel, progress_bar=True
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    print(f"Latency: {latency:.3f}")
--- a/benchmark/json_jump_forward/bench_other.py
+++ b/benchmark/json_jump_forward/bench_other.py
@@ -175,7 +175,7 @@ def bench_character(args):
    else:
        raise ValueError(f"Invalid backend: {args.backend}")
-    tic = time.time()
+    tic = time.perf_counter()
    if args.backend != "lmql":
        if args.parallel == 1:
@@ -202,7 +202,7 @@ def bench_character(args):
                asyncio.gather(*[get_one_answer_async(i) for i in bt])
            )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    return states, latency
@@ -236,7 +236,7 @@ def bench_city_doc(args):
    else:
        raise ValueError(f"Invalid backend: {args.backend}")
-    tic = time.time()
+    tic = time.perf_counter()
    if args.parallel == 1:
        for i in tqdm(range(len(arguments))):
            get_one_answer(i)
@@ -246,7 +246,7 @@ def bench_city_doc(args):
            for _ in rets:
                pass
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    return states, latency
--- a/benchmark/json_jump_forward/bench_sglang.py
+++ b/benchmark/json_jump_forward/bench_sglang.py
@@ -67,14 +67,14 @@ def bench_city_doc(args):
    sgl.set_default_backend(backend)
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    states = city_gen.run_batch(
        arguments,
        temperature=0,
        num_threads=args.parallel,
        progress_bar=True,
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    return states, latency
@@ -91,14 +91,14 @@ def bench_character(args):
    sgl.set_default_backend(backend)
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    states = character_gen.run_batch(
        arguments,
        temperature=0,
        num_threads=args.parallel,
        progress_bar=True,
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    return states, latency
--- a/benchmark/json_schema/bench_sglang.py
+++ b/benchmark/json_schema/bench_sglang.py
@@ -85,14 +85,14 @@ def bench_schema(args):
    sgl.set_default_backend(backend)
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    states = schema_gen.run_batch(
        arguments,
        temperature=0,
        num_threads=args.parallel,
        progress_bar=True,
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Check if the outputs are valid
    indexes = []
--- a/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
+++ b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
@@ -487,7 +487,7 @@ def main(args: argparse.Namespace):
            ]
        print(f"Start tuning over {len(search_space)} configurations...")
-        start = time.time()
+        start = time.perf_counter()
        configs = _distribute(
            "tune",
            [
@@ -522,7 +522,7 @@ def main(args: argparse.Namespace):
            use_int8_w8a16,
            block_shape,
        )
-        end = time.time()
+        end = time.perf_counter()
        print(f"Tuning took {end - start:.2f} seconds")
    else:
        outputs = _distribute(
--- a/benchmark/kernels/quantization/tuning_block_wise_kernel.py
+++ b/benchmark/kernels/quantization/tuning_block_wise_kernel.py
@@ -359,7 +359,7 @@ def tune_on_gpu(args_dict):
        config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0
    ]
-    start = time.time()
+    start = time.perf_counter()
    results = {}
    for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
        N, K = shape[0], shape[1]
@@ -379,7 +379,7 @@ def tune_on_gpu(args_dict):
        best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)}
        save_configs(N, K, block_n, block_k, best_configs, save_path, input_type)
-    end = time.time()
+    end = time.perf_counter()
    print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
--- a/benchmark/line_retrieval/bench_sglang.py
+++ b/benchmark/line_retrieval/bench_sglang.py
@@ -70,7 +70,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
    # Select backend
    backend = select_sglang_backend(args)
-    tic = time.time()
+    tic = time.perf_counter()
    states = line_retrieval.run_batch(
        arguments,
        temperature=0,
@@ -78,7 +78,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
        num_threads=args.parallel,
        progress_bar=True,
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    corrects = []
    for i in range(len(arguments)):
--- a/benchmark/llava_bench/bench_sglang.py
+++ b/benchmark/llava_bench/bench_sglang.py
@@ -41,7 +41,7 @@ def main(args):
    sgl.set_default_backend(backend)
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    if args.parallel == 1:
        for i in tqdm.tqdm(range(len(lines))):
            image_file = arguments[i]["image_file"]
@@ -52,7 +52,7 @@ def main(args):
        states = image_qa.run_batch(
            arguments, temperature=0, num_threads=args.parallel, progress_bar=True
        )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    print(f"Latency: {latency:.3f}")
--- a/benchmark/llm_judge/bench_other.py
+++ b/benchmark/llm_judge/bench_other.py
@@ -85,7 +85,7 @@ def main(args):
    call_generate = partial(get_call_generate(args), temperature=0)
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    if args.backend != "lmql":
@@ -120,7 +120,7 @@ def main(args):
                asyncio.gather(*[get_one_answer_async(i) for i in bt])
            )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    print(f"Latency: {latency:.3f}")
--- a/benchmark/llm_judge/bench_sglang.py
+++ b/benchmark/llm_judge/bench_sglang.py
@@ -59,7 +59,7 @@ def main(args):
    backend = select_sglang_backend(args)
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    states = multi_dimension_judge.run_batch(
        arguments,
        temperature=0,
@@ -67,7 +67,7 @@ def main(args):
        num_threads=args.parallel,
        progress_bar=True,
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    print(f"Latency: {latency:.3f}")
--- a/benchmark/long_json_decode/bench_other.py
+++ b/benchmark/long_json_decode/bench_other.py
@@ -45,7 +45,7 @@ def main(args):
    def get_one_answer(i):
        states[i] = json_decode(generate=call_generate, **arguments[i])
-    tic = time.time()
+    tic = time.perf_counter()
    if args.parallel == 1:
        for i in tqdm(range(len(arguments))):
            get_one_answer(i)
@@ -58,7 +58,7 @@ def main(args):
                )
            )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    print(f"Latency: {latency:.3f}")
--- a/benchmark/long_json_decode/bench_sglang.py
+++ b/benchmark/long_json_decode/bench_sglang.py
@@ -46,11 +46,11 @@ def main(args):
    sgl.set_default_backend(backend)
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    states = json_decode.run_batch(
        arguments, temperature=0, num_threads=args.parallel, progress_bar=True
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    print(f"Latency: {latency:.3f}")
--- a/benchmark/mmlu/bench_other.py
+++ b/benchmark/mmlu/bench_other.py
@@ -76,7 +76,7 @@ def evaluate(args, subject, dev_df, test_df, call_generate):
            pred = call_generate(prompts[i], temperature=0, max_tokens=max_tokens)
            preds[i] = pred.strip()[0]
-        tic = time.time()
+        tic = time.perf_counter()
        if args.parallel == 1:
            for i in range(len(prompts)):
                get_one_answer(i)
@@ -94,9 +94,9 @@ def evaluate(args, subject, dev_df, test_df, call_generate):
                for j in range(len(rets)):
                    preds[i + j] = rets[j].strip()[0]
-        tic = time.time()
+        tic = time.perf_counter()
        asyncio.run(batched_call(batch_size=args.parallel))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    cors = [pred == label for pred, label in zip(preds, labels)]
--- a/benchmark/mmlu/bench_sglang.py
+++ b/benchmark/mmlu/bench_sglang.py
@@ -116,7 +116,7 @@ def main(args):
    backend = select_sglang_backend(args)
    # Run
-    tic = time.time()
+    tic = time.perf_counter()
    states = few_shot_mmlu.run_batch(
        arguments,
        temperature=0,
@@ -128,7 +128,7 @@ def main(args):
    preds = [
        s["answer"].strip()[0] if len(s["answer"].strip()) > 0 else "" for s in states
    ]
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    cors = [pred == label for pred, label in zip(preds, labels)]
--- a/benchmark/mmmu/bench_sglang.py
+++ b/benchmark/mmmu/bench_sglang.py
@@ -119,7 +119,7 @@ async def eval_mmmu(args) -> None:
        api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1"
    )
    semaphore = asyncio.Semaphore(args.concurrency)
-    start = time.time()
+    start = time.perf_counter()
    base_url = f"http://127.0.0.1:{args.port}"
    if args.profile:
@@ -147,7 +147,7 @@ async def eval_mmmu(args) -> None:
        if profile_output.success:
            print("Profiler stopped")
-    print(f"Benchmark time: {time.time() - start}")
+    print(f"Benchmark time: {time.perf_counter() - start}")
    args.output_path = f"./val_sglang.json"
    save_json(args.output_path, out_samples)
    eval_result(model_answer_path=args.output_path, answer_dict=answer_dict)
--- a/benchmark/mtbench/bench_other.py
+++ b/benchmark/mtbench/bench_other.py
@@ -66,7 +66,7 @@ def main(args):
        answers[i] = cur_answers
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    if args.parallel == 1:
        for i in tqdm(range(len(questions))):
            get_answer(i)
@@ -79,7 +79,7 @@ def main(args):
                )
            )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
--- a/benchmark/mtbench/bench_sglang.py
+++ b/benchmark/mtbench/bench_sglang.py
@@ -57,7 +57,7 @@ def main(args):
    sgl.set_default_backend(backend)
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    rets = answer_mt_bench.run_batch(
        arguments,
        temperature=0,
@@ -66,7 +66,7 @@ def main(args):
        progress_bar=True,
    )
    answers = [[s["answer_1"], s["answer_2"]] for s in rets]
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
--- a/benchmark/mtbench/bench_sglang_eagle.py
+++ b/benchmark/mtbench/bench_sglang_eagle.py
@@ -68,7 +68,7 @@ def main(args):
    sgl.set_default_backend(backend)
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    rets = answer_mt_bench.run_batch(
        arguments,
        temperature=0,
@@ -78,7 +78,7 @@ def main(args):
    )
    answers = [[s["answer_1"], s["answer_2"]] for s in rets]
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    num_output_tokens = sum(
        s.get_meta_info("answer_1")["completion_tokens"]
        + s.get_meta_info("answer_2")["completion_tokens"]
--- a/benchmark/multi_chain_reasoning/bench_other.py
+++ b/benchmark/multi_chain_reasoning/bench_other.py
@@ -113,7 +113,7 @@ def main(args):
            answer = multi_chain_gsm8k(questions[i], args.num_chains, call_generate)
            states[i] = answer
-        tic = time.time()
+        tic = time.perf_counter()
        if args.parallel == 1:
            for i in tqdm(range(len(questions))):
                get_one_answer(i)
@@ -134,7 +134,7 @@ def main(args):
            )
            states[i] = answer
-        tic = time.time()
+        tic = time.perf_counter()
        loop = asyncio.get_event_loop()
        batches = [
            list(range(i, min(i + args.parallel, len(questions))))
@@ -144,7 +144,7 @@ def main(args):
            tasks = [get_one_answer_asyncio(k) for k in bt]
            loop.run_until_complete(asyncio.gather(*tasks))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    preds = []
    for i in range(len(states)):
--- a/benchmark/multi_chain_reasoning/bench_sglang.py
+++ b/benchmark/multi_chain_reasoning/bench_sglang.py
@@ -90,7 +90,7 @@ def main(args):
    backend = select_sglang_backend(args)
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    states = multi_chain_gsm8k.run_batch(
        arguments,
        temperature=0,
@@ -98,7 +98,7 @@ def main(args):
        num_threads=args.parallel,
        progress_bar=True,
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    preds = []
    for i in range(len(states)):
--- a/benchmark/multi_document_qa/bench_other.py
+++ b/benchmark/multi_document_qa/bench_other.py
@@ -61,7 +61,7 @@ def main(args):
    def get_one_answer(i):
        states[i] = multi_document_qa(generate=call_generate, **arguments[i])
-    tic = time.time()
+    tic = time.perf_counter()
    if args.parallel == 1:
        for i in tqdm(range(len(labels))):
            get_one_answer(i)
@@ -74,7 +74,7 @@ def main(args):
                )
            )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    print(states)
--- a/benchmark/multi_document_qa/bench_sglang.py
+++ b/benchmark/multi_document_qa/bench_sglang.py
@@ -49,11 +49,11 @@ def main(args):
    sgl.set_default_backend(backend)
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    states = multi_document_qa.run_batch(
        arguments, temperature=0, num_threads=args.parallel, progress_bar=True
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    print([s["answer"] for s in states])
--- a/benchmark/multi_turn_chat/bench_other.py
+++ b/benchmark/multi_turn_chat/bench_other.py
@@ -35,7 +35,7 @@ def main(args):
    def get_one_answer(i):
        states[i] = multi_turns(generate=call_generate, **multi_qas[i])
-    tic = time.time()
+    tic = time.perf_counter()
    if args.parallel == 1:
        for i in tqdm(range(len(multi_qas))):
            get_one_answer(i)
@@ -50,7 +50,7 @@ def main(args):
            for _ in rets:
                pass
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    print(f"Latency: {latency:.3f}")
--- a/benchmark/multi_turn_chat/bench_sglang.py
+++ b/benchmark/multi_turn_chat/bench_sglang.py
@@ -27,7 +27,7 @@ def main(args):
    backend = select_sglang_backend(args)
-    tic = time.time()
+    tic = time.perf_counter()
    states = multi_turns.run_batch(
        multi_qas,
        temperature=0,
@@ -35,7 +35,7 @@ def main(args):
        num_threads=args.parallel,
        progress_bar=True,
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    print(f"Latency: {latency:.3f}")
--- a/benchmark/multi_turn_chat/long_prompt_multi_turn.py
+++ b/benchmark/multi_turn_chat/long_prompt_multi_turn.py
@@ -84,7 +84,7 @@ def main(args):
    backend = select_sglang_backend(args)
-    tic = time.time()
+    tic = time.perf_counter()
    states = multi_turns.run_batch(
        multi_qas,
        temperature=0,
@@ -92,7 +92,7 @@ def main(args):
        num_threads="auto",
        progress_bar=True,
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    print(f"Latency: {latency:.3f}")
--- a/benchmark/react/bench_other.py
+++ b/benchmark/react/bench_other.py
@@ -146,7 +146,7 @@ def main(args):
            states.append(answer)
-    tic = time.time()
+    tic = time.perf_counter()
    if args.backend != "lmql":
        if args.parallel == 1:
@@ -173,7 +173,7 @@ def main(args):
            tasks = [run_single_agent_async(arg) for arg in bt]
            loop.run_until_complete(asyncio.gather(*tasks))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    print(f"Latency: {latency:.3f}")
--- a/benchmark/react/bench_sglang.py
+++ b/benchmark/react/bench_sglang.py
@@ -115,14 +115,14 @@ def main(args):
    sgl.set_default_backend(backend)
    states = []
-    tic = time.time()
+    tic = time.perf_counter()
    states = webthink.run_batch(
        arguments,
        temperature=0,
        num_threads=args.parallel,
        progress_bar=True,
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    print(f"Latency: {latency:.3f}")
--- a/benchmark/reasoning_benchmark/bench_sglang.py
+++ b/benchmark/reasoning_benchmark/bench_sglang.py
@@ -51,7 +51,7 @@ def main(args):
    )
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    states = reasoning_gen.run_batch(
        questions,
        num_threads=args.parallel,
@@ -60,7 +60,7 @@ def main(args):
        max_new_tokens=32768,
        top_p=0.95,
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Extract results and record outcomes in a list.
    outcomes = []
--- a/benchmark/tip_suggestion/bench_other.py
+++ b/benchmark/tip_suggestion/bench_other.py
@@ -68,7 +68,7 @@ def main(args):
    call_generate = partial(get_call_generate(args), temperature=0)
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    if args.backend != "lmql":
        def get_one_answer(i):
@@ -102,7 +102,7 @@ def main(args):
            loop.run_until_complete(
                asyncio.gather(*[get_one_answer_async(i) for i in batch])
            )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    print(f"Latency: {latency:.3f}")
--- a/benchmark/tip_suggestion/bench_sglang.py
+++ b/benchmark/tip_suggestion/bench_sglang.py
@@ -65,11 +65,11 @@ def main(args):
    sgl.set_default_backend(select_sglang_backend(args))
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    states = suggest_tips.run_batch(
        arguments, temperature=0, num_threads=args.parallel, progress_bar=True
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    print(f"Latency: {latency:.3f}")
--- a/benchmark/tree_of_thought_deep/bench_other.py
+++ b/benchmark/tree_of_thought_deep/bench_other.py
@@ -138,7 +138,7 @@ def main(args):
    # Run requests
    states = [None] * len(questions)
-    tic = time.time()
+    tic = time.perf_counter()
    if args.backend != "lmql":
        def get_one_answer(i):
@@ -177,7 +177,7 @@ def main(args):
            tasks = [get_one_answer_async(k) for k in bt]
            loop.run_until_complete(asyncio.gather(*tasks))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    answers_text = []
    for s in states:
--- a/benchmark/tree_of_thought_deep/bench_sglang.py
+++ b/benchmark/tree_of_thought_deep/bench_sglang.py
@@ -119,7 +119,7 @@ def main(args):
    backend = select_sglang_backend(args)
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    states = tree_search.run_batch(
        arguments,
        temperature=0,
@@ -127,7 +127,7 @@ def main(args):
        num_threads=args.parallel,
        progress_bar=True,
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    answers_text = []
    for s in states:
        answers_text.append([x for xs in s.ret_value for x in xs])
--- a/benchmark/tree_of_thought_v0/bench_other.py
+++ b/benchmark/tree_of_thought_v0/bench_other.py
@@ -121,7 +121,7 @@ def main(args):
    def get_one_answer(i):
        states[i] = tree_search(**arguments[i], call_generate=call_generate)
-    tic = time.time()
+    tic = time.perf_counter()
    if args.parallel == 1:
        for i in tqdm(range(len(questions))):
            get_one_answer(i)
@@ -134,7 +134,7 @@ def main(args):
                )
            )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    answers_text = []
    for s in states:
--- a/benchmark/tree_of_thought_v0/bench_sglang.py
+++ b/benchmark/tree_of_thought_v0/bench_sglang.py
@@ -107,7 +107,7 @@ def main(args):
    backend = select_sglang_backend(args)
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    states = tree_search.run_batch(
        arguments,
        temperature=0,
@@ -115,7 +115,7 @@ def main(args):
        num_threads=args.parallel,
        progress_bar=True,
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    answers_text = []
    for s in states:
        answers_text.append([x for xs in s["answer"] for x in xs])
--- a/python/sglang/test/few_shot_gsm8k.py
+++ b/python/sglang/test/few_shot_gsm8k.py
@@ -90,7 +90,7 @@ def run_eval(args):
    #####################################
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    states = few_shot_gsm8k.run_batch(
        arguments,
        temperature=args.temperature if hasattr(args, "temperature") else 0,
@@ -99,7 +99,7 @@ def run_eval(args):
        return_logprob=getattr(args, "return_logprob", None),
        logprob_start_len=getattr(args, "logprob_start_len", None),
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    preds = []
    for i in range(len(states)):
--- a/python/sglang/test/few_shot_gsm8k_engine.py
+++ b/python/sglang/test/few_shot_gsm8k_engine.py
@@ -89,7 +89,7 @@ def run_eval(args):
    }
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    loop = asyncio.get_event_loop()
@@ -98,7 +98,7 @@ def run_eval(args):
    )
    # End requests
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Shutdown the engine
    engine.shutdown()
--- a/python/sglang/test/run_eval.py
+++ b/python/sglang/test/run_eval.py
@@ -71,9 +71,9 @@ def run_eval(args):
    )
    # Run eval
-    tic = time.time()
+    tic = time.perf_counter()
    result = eval_obj(sampler)
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Dump reports
    metrics = result.metrics | {"score": result.score}
--- a/python/sglang/test/test_programs.py
+++ b/python/sglang/test/test_programs.py
@@ -503,7 +503,7 @@ def test_hellaswag_select():
    #####################################
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    rets = few_shot_hellaswag.run_batch(
        arguments,
        temperature=0,
@@ -514,13 +514,13 @@ def test_hellaswag_select():
    preds = []
    for i, ret in enumerate(rets):
        preds.append(choices[i].index(ret["answer"]))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    accuracy = np.mean(np.array(preds) == np.array(labels))
    # Test generator style of run_batch
-    tic = time.time()
+    tic = time.perf_counter()
    rets = few_shot_hellaswag.run_batch(
        arguments,
        temperature=0,
@@ -531,7 +531,7 @@ def test_hellaswag_select():
    preds_gen = []
    for i, ret in enumerate(rets):
        preds_gen.append(choices[i].index(ret["answer"]))
-    latency_gen = time.time() - tic
+    latency_gen = time.perf_counter() - tic
    # Compute accuracy
    accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -449,9 +449,9 @@ def popen_launch_server(
    else:
        process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
-    start_time = time.time()
+    start_time = time.perf_counter()
    with requests.Session() as session:
-        while time.time() - start_time < timeout:
+        while time.perf_counter() - start_time < timeout:
            try:
                headers = {
                    "Content-Type": "application/json; charset=utf-8",
@@ -584,7 +584,7 @@ class TestFile:
 def run_unittest_files(files: List[TestFile], timeout_per_file: float):
-    tic = time.time()
+    tic = time.perf_counter()
    success = True
    for i, file in enumerate(files):
@@ -599,13 +599,13 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
                f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
                flush=True,
            )
-            tic = time.time()
+            tic = time.perf_counter()
            process = subprocess.Popen(
                ["python3", filename], stdout=None, stderr=None, env=os.environ
            )
            process.wait()
-            elapsed = time.time() - tic
+            elapsed = time.perf_counter() - tic
            print(
                f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
@@ -631,9 +631,9 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
            break
    if success:
-        print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+        print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
    else:
-        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+        print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
    return 0 if success else -1
--- a/sgl-router/py_test/test_launch_server.py
+++ b/sgl-router/py_test/test_launch_server.py
@@ -92,9 +92,9 @@ def popen_launch_router(
    process = subprocess.Popen(command, stdout=None, stderr=None)
-    start_time = time.time()
+    start_time = time.perf_counter()
    with requests.Session() as session:
-        while time.time() - start_time < timeout:
+        while time.perf_counter() - start_time < timeout:
            try:
                response = session.get(f"{base_url}/health")
                if response.status_code == 200:
@@ -155,11 +155,11 @@ def terminate_and_wait(process, timeout=300):
        return
    process.terminate()
-    start_time = time.time()
+    start_time = time.perf_counter()
    while process.poll() is None:
        print(f"Terminating process {process.pid}")
-        if time.time() - start_time > timeout:
+        if time.perf_counter() - start_time > timeout:
            raise TimeoutError(
                f"Process {process.pid} failed to terminate within {timeout}s"
            )
--- a/test/srt/experiment_runner.py
+++ b/test/srt/experiment_runner.py
@@ -184,9 +184,9 @@ class ExperimentRunner:
        self.logger = logging.getLogger(__name__)
    def wait_for_server(self, port: int, timeout: int = 300) -> bool:
-        start_time = time.time()
+        start_time = time.perf_counter()
-        while time.time() - start_time < timeout:
+        while time.perf_counter() - start_time < timeout:
            try:
                response = requests.get(f"http://localhost:{port}/health")
                if response.status_code == 200:
@@ -197,7 +197,7 @@ class ExperimentRunner:
        return False
    def run_task(self, config: TaskConfig) -> TaskResult:
-        start_time = time.time()
+        start_time = time.perf_counter()
        client_output = []
        try:
@@ -247,7 +247,7 @@ class ExperimentRunner:
                name=config.name,
                success=True,
                output=formatted_output,
-                runtime=time.time() - start_time,
+                runtime=time.perf_counter() - start_time,
                timestamp=datetime.now().isoformat(),
            )
@@ -256,7 +256,7 @@ class ExperimentRunner:
                name=config.name,
                success=False,
                output=str(e),
-                runtime=time.time() - start_time,
+                runtime=time.perf_counter() - start_time,
                timestamp=datetime.now().isoformat(),
            )
--- a/test/srt/models/test_encoder_embedding_models.py
+++ b/test/srt/models/test_encoder_embedding_models.py
@@ -79,9 +79,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
            # warm up
            hf_outputs = hf_runner.forward(truncated_prompts)
-            st_start_time = time.time()
+            st_start_time = time.perf_counter()
            hf_outputs = hf_runner.forward(truncated_prompts)
-            st_end_time = time.time()
+            st_end_time = time.perf_counter()
        with SRTRunner(
            model_path,
@@ -95,9 +95,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
            # warm up
            srt_outputs = srt_runner.forward(truncated_prompts)
-            sgl_start_time = time.time()
+            sgl_start_time = time.perf_counter()
            srt_outputs = srt_runner.forward(truncated_prompts)
-            sgl_end_time = time.time()
+            sgl_end_time = time.perf_counter()
        transformer_time = st_end_time - st_start_time
        sgl_time = sgl_end_time - sgl_start_time
--- a/test/srt/test_gptqmodel_dynamic.py
+++ b/test/srt/test_gptqmodel_dynamic.py
@@ -130,9 +130,9 @@ class TestGPTQModelDynamic(CustomTestCase):
    def test_throughput(self):
        max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
        result = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
        print(f"result = `{result}`")
@@ -185,9 +185,9 @@ class TestGPTQModelDynamicWithMarlin(CustomTestCase):
    def test_throughput(self):
        max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
        result = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
        print(f"result = `{result}`")
--- a/test/srt/test_release_memory_occupation.py
+++ b/test/srt/test_release_memory_occupation.py
@@ -42,10 +42,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
        )
        print("release_memory_occupation start")
-        t = time.time()
+        t = time.perf_counter()
        engine.release_memory_occupation()
        if _DEBUG_EXTRA:
-            print("release_memory_occupation", time.time() - t)
+            print("release_memory_occupation", time.perf_counter() - t)
        if _DEBUG_EXTRA:
            time.sleep(5)
@@ -60,10 +60,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
            time.sleep(5)
        print("resume_memory_occupation start")
-        t = time.time()
+        t = time.perf_counter()
        engine.resume_memory_occupation()
        if _DEBUG_EXTRA:
-            print("resume_memory_occupation", time.time() - t)
+            print("resume_memory_occupation", time.perf_counter() - t)
        self.assertEqual(
            _try_allocate_big_tensor(),
--- a/test/srt/test_torch_compile.py
+++ b/test/srt/test_torch_compile.py
@@ -62,9 +62,9 @@ class TestTorchCompile(CustomTestCase):
        res = self.run_decode(16)
        max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
        res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
        print(f"{res=}")
        throughput = max_tokens / (tok - tic)
        print(f"Throughput: {throughput} tokens/s")
--- a/test/srt/test_torch_compile_moe.py
+++ b/test/srt/test_torch_compile_moe.py
@@ -62,9 +62,9 @@ class TestTorchCompileMoe(CustomTestCase):
        res = self.run_decode(16)
        max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
        res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
        print(f"{res=}")
        throughput = max_tokens / (tok - tic)
        self.assertGreaterEqual(throughput, 285)
--- a/test/srt/test_torchao.py
+++ b/test/srt/test_torchao.py
@@ -61,9 +61,9 @@ class TestTorchAO(CustomTestCase):
        max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
        res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
        print(res["text"])
        throughput = max_tokens / (tok - tic)
        print(f"Throughput: {throughput} tokens/s")
--- a/test/srt/test_update_weights_from_distributed.py
+++ b/test/srt/test_update_weights_from_distributed.py
@@ -164,7 +164,7 @@ def init_process_hf(
    )
    dist.barrier(group=group, device_ids=[rank])
    torch.cuda.synchronize()
-    time_begin_broadcast = time.time()
+    time_begin_broadcast = time.perf_counter()
    # The last parameter is lm_head.weight, which is tied
    # with embed_tokens.weight. Actually, we only need
@@ -182,7 +182,7 @@ def init_process_hf(
            group=group,
        )
    torch.cuda.synchronize()
-    time_end_broadcast = time.time()
+    time_end_broadcast = time.perf_counter()
    # Measure the latency of broadcasting/weights update.
    broadcast_time = time_end_broadcast - time_begin_broadcast
@@ -282,7 +282,7 @@ def init_process_sgl(
        )
    torch.cuda.synchronize()
-    time_begin_update = time.time()
+    time_begin_update = time.perf_counter()
    # The last parameter is lm_head.weight, which is tied
    # with embed_tokens.weight. Actually, we only need
@@ -312,7 +312,7 @@ def init_process_sgl(
                },
            )
    torch.cuda.synchronize()
-    time_end_update = time.time()
+    time_end_update = time.perf_counter()
    # Measure the latency of broadcast/weights update.
    update_time = time_end_update - time_begin_update
--- a/test/srt/test_update_weights_from_tensor.py
+++ b/test/srt/test_update_weights_from_tensor.py
@@ -21,9 +21,9 @@ def test_update_weights_from_tensor(tp_size):
    memory_before = torch.cuda.memory_allocated()
    new_tensor = torch.full((16384, 2048), 1.5, device="cuda")
-    time_start = time.time()
+    time_start = time.perf_counter()
    engine.update_weights_from_tensor([(x, new_tensor) for x in param_names])
-    print(f"Time delta: {time.time() - time_start:.03f}")
+    print(f"Time delta: {time.perf_counter() - time_start:.03f}")
    for param_name in param_names[:3]:
        _check_param(engine, param_name, [1.5] * 5)
--- a/test/srt/test_w8a8_quantization.py
+++ b/test/srt/test_w8a8_quantization.py
@@ -62,9 +62,9 @@ class TestW8A8(CustomTestCase):
    def test_throughput(self):
        max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
        res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
        print(res["text"])
        throughput = max_tokens / (tok - tic)
        print(f"Throughput: {throughput} tokens/s")