Replace time.time() with time.perf_counter() for benchmarking (#6178)
Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
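Rationale: time.time() reads the wall clock, which has coarse resolution on some platforms and can jump when the system clock is adjusted (e.g. by NTP), so intervals measured with it can be skewed or even negative. time.perf_counter() is monotonic and uses the highest-resolution clock available, which is exactly what interval benchmarking needs. A minimal sketch of the pattern this commit applies throughout the tests; `bench` and the workload argument are illustrative names, not part of the change:

```python
import time

def bench(workload, *args):
    tic = time.perf_counter()  # monotonic clock, highest available resolution
    result = workload(*args)
    tok = time.perf_counter()
    return result, tok - tic   # elapsed seconds; never negative, unlike time.time() deltas
```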
@@ -184,9 +184,9 @@ class ExperimentRunner:
         self.logger = logging.getLogger(__name__)

     def wait_for_server(self, port: int, timeout: int = 300) -> bool:
-        start_time = time.time()
+        start_time = time.perf_counter()

-        while time.time() - start_time < timeout:
+        while time.perf_counter() - start_time < timeout:
             try:
                 response = requests.get(f"http://localhost:{port}/health")
                 if response.status_code == 200:
@@ -197,7 +197,7 @@ class ExperimentRunner:
         return False

     def run_task(self, config: TaskConfig) -> TaskResult:
-        start_time = time.time()
+        start_time = time.perf_counter()
         client_output = []

         try:
@@ -247,7 +247,7 @@ class ExperimentRunner:
                 name=config.name,
                 success=True,
                 output=formatted_output,
-                runtime=time.time() - start_time,
+                runtime=time.perf_counter() - start_time,
                 timestamp=datetime.now().isoformat(),
             )

@@ -256,7 +256,7 @@ class ExperimentRunner:
                 name=config.name,
                 success=False,
                 output=str(e),
-                runtime=time.time() - start_time,
+                runtime=time.perf_counter() - start_time,
                 timestamp=datetime.now().isoformat(),
             )

@@ -79,9 +79,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
             # warm up
             hf_outputs = hf_runner.forward(truncated_prompts)

-            st_start_time = time.time()
+            st_start_time = time.perf_counter()
             hf_outputs = hf_runner.forward(truncated_prompts)
-            st_end_time = time.time()
+            st_end_time = time.perf_counter()

         with SRTRunner(
             model_path,
@@ -95,9 +95,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
             # warm up
             srt_outputs = srt_runner.forward(truncated_prompts)

-            sgl_start_time = time.time()
+            sgl_start_time = time.perf_counter()
             srt_outputs = srt_runner.forward(truncated_prompts)
-            sgl_end_time = time.time()
+            sgl_end_time = time.perf_counter()

         transformer_time = st_end_time - st_start_time
         sgl_time = sgl_end_time - sgl_start_time

@@ -130,9 +130,9 @@ class TestGPTQModelDynamic(CustomTestCase):
     def test_throughput(self):
         max_tokens = 256

-        tic = time.time()
+        tic = time.perf_counter()
         result = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()

         print(f"result = `{result}`")

@@ -185,9 +185,9 @@ class TestGPTQModelDynamicWithMarlin(CustomTestCase):
     def test_throughput(self):
         max_tokens = 256

-        tic = time.time()
+        tic = time.perf_counter()
         result = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()

         print(f"result = `{result}`")

@@ -42,10 +42,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
         )

         print("release_memory_occupation start")
-        t = time.time()
+        t = time.perf_counter()
         engine.release_memory_occupation()
         if _DEBUG_EXTRA:
-            print("release_memory_occupation", time.time() - t)
+            print("release_memory_occupation", time.perf_counter() - t)

         if _DEBUG_EXTRA:
             time.sleep(5)
@@ -60,10 +60,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
             time.sleep(5)

         print("resume_memory_occupation start")
-        t = time.time()
+        t = time.perf_counter()
         engine.resume_memory_occupation()
         if _DEBUG_EXTRA:
-            print("resume_memory_occupation", time.time() - t)
+            print("resume_memory_occupation", time.perf_counter() - t)

         self.assertEqual(
             _try_allocate_big_tensor(),

@@ -62,9 +62,9 @@ class TestTorchCompile(CustomTestCase):
         res = self.run_decode(16)

         max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
         res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
         print(f"{res=}")
         throughput = max_tokens / (tok - tic)
         print(f"Throughput: {throughput} tokens/s")

@@ -62,9 +62,9 @@ class TestTorchCompileMoe(CustomTestCase):
         res = self.run_decode(16)

         max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
         res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
         print(f"{res=}")
         throughput = max_tokens / (tok - tic)
         self.assertGreaterEqual(throughput, 285)

@@ -61,9 +61,9 @@ class TestTorchAO(CustomTestCase):

         max_tokens = 256

-        tic = time.time()
+        tic = time.perf_counter()
         res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
         print(res["text"])
         throughput = max_tokens / (tok - tic)
         print(f"Throughput: {throughput} tokens/s")

@@ -164,7 +164,7 @@ def init_process_hf(
     )
     dist.barrier(group=group, device_ids=[rank])
     torch.cuda.synchronize()
-    time_begin_broadcast = time.time()
+    time_begin_broadcast = time.perf_counter()

     # The last parameter is lm_head.weight, which is tied
     # with embed_tokens.weight. Actually, we only need
@@ -182,7 +182,7 @@ def init_process_hf(
             group=group,
         )
     torch.cuda.synchronize()
-    time_end_broadcast = time.time()
+    time_end_broadcast = time.perf_counter()

     # Measure the latency of broadcasting/weights update.
     broadcast_time = time_end_broadcast - time_begin_broadcast
@@ -282,7 +282,7 @@ def init_process_sgl(
     )

     torch.cuda.synchronize()
-    time_begin_update = time.time()
+    time_begin_update = time.perf_counter()

     # The last parameter is lm_head.weight, which is tied
     # with embed_tokens.weight. Actually, we only need
@@ -312,7 +312,7 @@ def init_process_sgl(
             },
         )
     torch.cuda.synchronize()
-    time_end_update = time.time()
+    time_end_update = time.perf_counter()

     # Measure the latency of broadcast/weights update.
     update_time = time_end_update - time_begin_update

@@ -21,9 +21,9 @@ def test_update_weights_from_tensor(tp_size):
     memory_before = torch.cuda.memory_allocated()
     new_tensor = torch.full((16384, 2048), 1.5, device="cuda")

-    time_start = time.time()
+    time_start = time.perf_counter()
     engine.update_weights_from_tensor([(x, new_tensor) for x in param_names])
-    print(f"Time delta: {time.time() - time_start:.03f}")
+    print(f"Time delta: {time.perf_counter() - time_start:.03f}")

     for param_name in param_names[:3]:
         _check_param(engine, param_name, [1.5] * 5)

@@ -62,9 +62,9 @@ class TestW8A8(CustomTestCase):
     def test_throughput(self):
         max_tokens = 256

-        tic = time.time()
+        tic = time.perf_counter()
         res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
         print(res["text"])
         throughput = max_tokens / (tok - tic)
         print(f"Throughput: {throughput} tokens/s")
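Note on the GPU-timed sections (init_process_hf / init_process_sgl): CUDA calls are asynchronous, so those hunks keep torch.cuda.synchronize() immediately before each clock read; without it, the timer would mostly capture kernel-launch overhead. A minimal sketch of that synchronize-then-time pattern, where `do_gpu_work` is a hypothetical placeholder for the timed operation:

```python
import time
import torch

def time_gpu(do_gpu_work):
    torch.cuda.synchronize()   # drain queued kernels before starting the clock
    t0 = time.perf_counter()
    do_gpu_work()
    torch.cuda.synchronize()   # wait until the GPU work actually finishes
    return time.perf_counter() - t0
```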