Replace time.time() with time.perf_counter() for benchmarking. (#6178)

Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
This commit is contained in:
Lifu Huang
2025-05-11 14:32:49 -07:00
committed by GitHub
parent e9a47f4cb5
commit 6e2da51561
61 changed files with 158 additions and 158 deletions

View File

@@ -184,9 +184,9 @@ class ExperimentRunner:
self.logger = logging.getLogger(__name__)
def wait_for_server(self, port: int, timeout: int = 300) -> bool:
start_time = time.time()
start_time = time.perf_counter()
while time.time() - start_time < timeout:
while time.perf_counter() - start_time < timeout:
try:
response = requests.get(f"http://localhost:{port}/health")
if response.status_code == 200:
@@ -197,7 +197,7 @@ class ExperimentRunner:
return False
def run_task(self, config: TaskConfig) -> TaskResult:
start_time = time.time()
start_time = time.perf_counter()
client_output = []
try:
@@ -247,7 +247,7 @@ class ExperimentRunner:
name=config.name,
success=True,
output=formatted_output,
runtime=time.time() - start_time,
runtime=time.perf_counter() - start_time,
timestamp=datetime.now().isoformat(),
)
@@ -256,7 +256,7 @@ class ExperimentRunner:
name=config.name,
success=False,
output=str(e),
runtime=time.time() - start_time,
runtime=time.perf_counter() - start_time,
timestamp=datetime.now().isoformat(),
)

View File

@@ -79,9 +79,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
# warm up
hf_outputs = hf_runner.forward(truncated_prompts)
st_start_time = time.time()
st_start_time = time.perf_counter()
hf_outputs = hf_runner.forward(truncated_prompts)
st_end_time = time.time()
st_end_time = time.perf_counter()
with SRTRunner(
model_path,
@@ -95,9 +95,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
# warm up
srt_outputs = srt_runner.forward(truncated_prompts)
sgl_start_time = time.time()
sgl_start_time = time.perf_counter()
srt_outputs = srt_runner.forward(truncated_prompts)
sgl_end_time = time.time()
sgl_end_time = time.perf_counter()
transformer_time = st_end_time - st_start_time
sgl_time = sgl_end_time - sgl_start_time

View File

@@ -130,9 +130,9 @@ class TestGPTQModelDynamic(CustomTestCase):
def test_throughput(self):
max_tokens = 256
tic = time.time()
tic = time.perf_counter()
result = self.run_decode(max_tokens)
tok = time.time()
tok = time.perf_counter()
print(f"result = `{result}`")
@@ -185,9 +185,9 @@ class TestGPTQModelDynamicWithMarlin(CustomTestCase):
def test_throughput(self):
max_tokens = 256
tic = time.time()
tic = time.perf_counter()
result = self.run_decode(max_tokens)
tok = time.time()
tok = time.perf_counter()
print(f"result = `{result}`")

View File

@@ -42,10 +42,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
)
print("release_memory_occupation start")
t = time.time()
t = time.perf_counter()
engine.release_memory_occupation()
if _DEBUG_EXTRA:
print("release_memory_occupation", time.time() - t)
print("release_memory_occupation", time.perf_counter() - t)
if _DEBUG_EXTRA:
time.sleep(5)
@@ -60,10 +60,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
time.sleep(5)
print("resume_memory_occupation start")
t = time.time()
t = time.perf_counter()
engine.resume_memory_occupation()
if _DEBUG_EXTRA:
print("resume_memory_occupation", time.time() - t)
print("resume_memory_occupation", time.perf_counter() - t)
self.assertEqual(
_try_allocate_big_tensor(),

View File

@@ -62,9 +62,9 @@ class TestTorchCompile(CustomTestCase):
res = self.run_decode(16)
max_tokens = 256
tic = time.time()
tic = time.perf_counter()
res = self.run_decode(max_tokens)
tok = time.time()
tok = time.perf_counter()
print(f"{res=}")
throughput = max_tokens / (tok - tic)
print(f"Throughput: {throughput} tokens/s")

View File

@@ -62,9 +62,9 @@ class TestTorchCompileMoe(CustomTestCase):
res = self.run_decode(16)
max_tokens = 256
tic = time.time()
tic = time.perf_counter()
res = self.run_decode(max_tokens)
tok = time.time()
tok = time.perf_counter()
print(f"{res=}")
throughput = max_tokens / (tok - tic)
self.assertGreaterEqual(throughput, 285)

View File

@@ -61,9 +61,9 @@ class TestTorchAO(CustomTestCase):
max_tokens = 256
tic = time.time()
tic = time.perf_counter()
res = self.run_decode(max_tokens)
tok = time.time()
tok = time.perf_counter()
print(res["text"])
throughput = max_tokens / (tok - tic)
print(f"Throughput: {throughput} tokens/s")

View File

@@ -164,7 +164,7 @@ def init_process_hf(
)
dist.barrier(group=group, device_ids=[rank])
torch.cuda.synchronize()
time_begin_broadcast = time.time()
time_begin_broadcast = time.perf_counter()
# The last parameter is lm_head.weight, which is tied
# with embed_tokens.weight. Actually, we only need
@@ -182,7 +182,7 @@ def init_process_hf(
group=group,
)
torch.cuda.synchronize()
time_end_broadcast = time.time()
time_end_broadcast = time.perf_counter()
# Measure the latency of broadcasting/weights update.
broadcast_time = time_end_broadcast - time_begin_broadcast
@@ -282,7 +282,7 @@ def init_process_sgl(
)
torch.cuda.synchronize()
time_begin_update = time.time()
time_begin_update = time.perf_counter()
# The last parameter is lm_head.weight, which is tied
# with embed_tokens.weight. Actually, we only need
@@ -312,7 +312,7 @@ def init_process_sgl(
},
)
torch.cuda.synchronize()
time_end_update = time.time()
time_end_update = time.perf_counter()
# Measure the latency of broadcast/weights update.
update_time = time_end_update - time_begin_update

View File

@@ -21,9 +21,9 @@ def test_update_weights_from_tensor(tp_size):
memory_before = torch.cuda.memory_allocated()
new_tensor = torch.full((16384, 2048), 1.5, device="cuda")
time_start = time.time()
time_start = time.perf_counter()
engine.update_weights_from_tensor([(x, new_tensor) for x in param_names])
print(f"Time delta: {time.time() - time_start:.03f}")
print(f"Time delta: {time.perf_counter() - time_start:.03f}")
for param_name in param_names[:3]:
_check_param(engine, param_name, [1.5] * 5)

View File

@@ -62,9 +62,9 @@ class TestW8A8(CustomTestCase):
def test_throughput(self):
max_tokens = 256
tic = time.time()
tic = time.perf_counter()
res = self.run_decode(max_tokens)
tok = time.time()
tok = time.perf_counter()
print(res["text"])
throughput = max_tokens / (tok - tic)
print(f"Throughput: {throughput} tokens/s")