Replace time.time() with time.perf_counter() for benchmarking (#6178)
Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
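Rationale: time.time() reads the wall clock, which has coarse resolution on some platforms and can jump when the system clock is adjusted (e.g. by NTP), so intervals measured with it can be skewed or even negative. time.perf_counter() is monotonic and uses the highest-resolution clock available, which is exactly what interval benchmarking needs. A minimal sketch of the pattern this commit applies throughout the tests; `bench` and the workload argument are illustrative names, not part of the change:

```python
import time

def bench(workload, *args):
    tic = time.perf_counter()  # monotonic clock, highest available resolution
    result = workload(*args)
    tok = time.perf_counter()
    return result, tok - tic   # elapsed seconds; never negative, unlike time.time() deltas
```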
@@ -184,9 +184,9 @@ class ExperimentRunner:
         self.logger = logging.getLogger(__name__)

     def wait_for_server(self, port: int, timeout: int = 300) -> bool:
-        start_time = time.time()
+        start_time = time.perf_counter()

-        while time.time() - start_time < timeout:
+        while time.perf_counter() - start_time < timeout:
             try:
                 response = requests.get(f"http://localhost:{port}/health")
                 if response.status_code == 200:
@@ -197,7 +197,7 @@ class ExperimentRunner:
         return False

     def run_task(self, config: TaskConfig) -> TaskResult:
-        start_time = time.time()
+        start_time = time.perf_counter()
         client_output = []

         try:
@@ -247,7 +247,7 @@ class ExperimentRunner:
                 name=config.name,
                 success=True,
                 output=formatted_output,
-                runtime=time.time() - start_time,
+                runtime=time.perf_counter() - start_time,
                 timestamp=datetime.now().isoformat(),
             )

@@ -256,7 +256,7 @@ class ExperimentRunner:
                 name=config.name,
                 success=False,
                 output=str(e),
-                runtime=time.time() - start_time,
+                runtime=time.perf_counter() - start_time,
                 timestamp=datetime.now().isoformat(),
             )

@@ -79,9 +79,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
             # warm up
             hf_outputs = hf_runner.forward(truncated_prompts)

-            st_start_time = time.time()
+            st_start_time = time.perf_counter()
             hf_outputs = hf_runner.forward(truncated_prompts)
-            st_end_time = time.time()
+            st_end_time = time.perf_counter()

         with SRTRunner(
             model_path,
@@ -95,9 +95,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
             # warm up
             srt_outputs = srt_runner.forward(truncated_prompts)

-            sgl_start_time = time.time()
+            sgl_start_time = time.perf_counter()
             srt_outputs = srt_runner.forward(truncated_prompts)
-            sgl_end_time = time.time()
+            sgl_end_time = time.perf_counter()

         transformer_time = st_end_time - st_start_time
         sgl_time = sgl_end_time - sgl_start_time

@@ -130,9 +130,9 @@ class TestGPTQModelDynamic(CustomTestCase):
     def test_throughput(self):
         max_tokens = 256

-        tic = time.time()
+        tic = time.perf_counter()
         result = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()

         print(f"result = `{result}`")

@@ -185,9 +185,9 @@ class TestGPTQModelDynamicWithMarlin(CustomTestCase):
     def test_throughput(self):
         max_tokens = 256

-        tic = time.time()
+        tic = time.perf_counter()
         result = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()

         print(f"result = `{result}`")

@@ -42,10 +42,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
         )

         print("release_memory_occupation start")
-        t = time.time()
+        t = time.perf_counter()
         engine.release_memory_occupation()
         if _DEBUG_EXTRA:
-            print("release_memory_occupation", time.time() - t)
+            print("release_memory_occupation", time.perf_counter() - t)

         if _DEBUG_EXTRA:
             time.sleep(5)
@@ -60,10 +60,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
             time.sleep(5)

         print("resume_memory_occupation start")
-        t = time.time()
+        t = time.perf_counter()
         engine.resume_memory_occupation()
         if _DEBUG_EXTRA:
-            print("resume_memory_occupation", time.time() - t)
+            print("resume_memory_occupation", time.perf_counter() - t)

         self.assertEqual(
             _try_allocate_big_tensor(),

@@ -62,9 +62,9 @@ class TestTorchCompile(CustomTestCase):
         res = self.run_decode(16)

         max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
         res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
         print(f"{res=}")
         throughput = max_tokens / (tok - tic)
         print(f"Throughput: {throughput} tokens/s")

@@ -62,9 +62,9 @@ class TestTorchCompileMoe(CustomTestCase):
         res = self.run_decode(16)

         max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
         res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
         print(f"{res=}")
         throughput = max_tokens / (tok - tic)
         self.assertGreaterEqual(throughput, 285)

@@ -61,9 +61,9 @@ class TestTorchAO(CustomTestCase):

         max_tokens = 256

-        tic = time.time()
+        tic = time.perf_counter()
         res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
         print(res["text"])
         throughput = max_tokens / (tok - tic)
         print(f"Throughput: {throughput} tokens/s")

@@ -164,7 +164,7 @@ def init_process_hf(
     )
     dist.barrier(group=group, device_ids=[rank])
     torch.cuda.synchronize()
-    time_begin_broadcast = time.time()
+    time_begin_broadcast = time.perf_counter()

     # The last parameter is lm_head.weight, which is tied
     # with embed_tokens.weight. Actually, we only need
@@ -182,7 +182,7 @@ def init_process_hf(
             group=group,
         )
     torch.cuda.synchronize()
-    time_end_broadcast = time.time()
+    time_end_broadcast = time.perf_counter()

     # Measure the latency of broadcasting/weights update.
     broadcast_time = time_end_broadcast - time_begin_broadcast
@@ -282,7 +282,7 @@ def init_process_sgl(
     )

     torch.cuda.synchronize()
-    time_begin_update = time.time()
+    time_begin_update = time.perf_counter()

     # The last parameter is lm_head.weight, which is tied
     # with embed_tokens.weight. Actually, we only need
@@ -312,7 +312,7 @@ def init_process_sgl(
             },
         )
     torch.cuda.synchronize()
-    time_end_update = time.time()
+    time_end_update = time.perf_counter()

     # Measure the latency of broadcast/weights update.
     update_time = time_end_update - time_begin_update

@@ -21,9 +21,9 @@ def test_update_weights_from_tensor(tp_size):
     memory_before = torch.cuda.memory_allocated()
     new_tensor = torch.full((16384, 2048), 1.5, device="cuda")

-    time_start = time.time()
+    time_start = time.perf_counter()
     engine.update_weights_from_tensor([(x, new_tensor) for x in param_names])
-    print(f"Time delta: {time.time() - time_start:.03f}")
+    print(f"Time delta: {time.perf_counter() - time_start:.03f}")

     for param_name in param_names[:3]:
         _check_param(engine, param_name, [1.5] * 5)

@@ -62,9 +62,9 @@ class TestW8A8(CustomTestCase):
     def test_throughput(self):
         max_tokens = 256

-        tic = time.time()
+        tic = time.perf_counter()
         res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
         print(res["text"])
         throughput = max_tokens / (tok - tic)
         print(f"Throughput: {throughput} tokens/s")
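Note on the GPU-timed sections (init_process_hf / init_process_sgl): CUDA calls are asynchronous, so those hunks keep torch.cuda.synchronize() immediately before each clock read; without it, the timer would mostly capture kernel-launch overhead. A minimal sketch of that synchronize-then-time pattern, where `do_gpu_work` is a hypothetical placeholder for the timed operation:

```python
import time
import torch

def time_gpu(do_gpu_work):
    torch.cuda.synchronize()   # drain queued kernels before starting the clock
    t0 = time.perf_counter()
    do_gpu_work()
    torch.cuda.synchronize()   # wait until the GPU work actually finishes
    return time.perf_counter() - t0
```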