Replace time.time() with time.perf_counter() for benchmarking. (#6178)
Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
This commit is contained in:
@@ -64,11 +64,11 @@ def test_batch_by_batch(all_prompts, gen_len):
|
||||
|
||||
tot_time = 0
|
||||
for i in range(len(all_prompts)):
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
text_qa.run_batch(
|
||||
list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))),
|
||||
)
|
||||
tot_time += time.time() - tic
|
||||
tot_time += time.perf_counter() - tic
|
||||
|
||||
return tot_time
|
||||
|
||||
@@ -78,13 +78,13 @@ def test_batch_by_batch_with_hint(all_prompts, gen_len):
|
||||
|
||||
tot_time = 0
|
||||
for i in range(len(all_prompts)):
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
# Send a hint to cache the prefix
|
||||
text_qa.run_batch(list(zip(all_prompts[i][:1], [gen_len])))
|
||||
# Send the batch
|
||||
text_qa.run_batch(list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))))
|
||||
|
||||
tot_time += time.time() - tic
|
||||
tot_time += time.perf_counter() - tic
|
||||
|
||||
return tot_time
|
||||
|
||||
@@ -94,11 +94,11 @@ def test_send_all(all_prompts, gen_len):
|
||||
|
||||
all_prompts = [x for prompt_list in all_prompts for x in prompt_list]
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
text_qa.run_batch(
|
||||
list(zip(all_prompts, [gen_len] * len(all_prompts))),
|
||||
)
|
||||
tot_time = time.time() - tic
|
||||
tot_time = time.perf_counter() - tic
|
||||
|
||||
return tot_time
|
||||
|
||||
|
||||
@@ -81,7 +81,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
|
||||
}
|
||||
data = {"text": prompts, "sampling_params": sampling_params}
|
||||
|
||||
start_time = time.time()
|
||||
start_time = time.perf_counter()
|
||||
try:
|
||||
response = requests.post(
|
||||
endpoint.base_url + "/generate", json=data, timeout=3600
|
||||
@@ -90,7 +90,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
|
||||
error = response.json()
|
||||
raise RuntimeError(f"Request {request_id} failed: {error}")
|
||||
result = response.json()
|
||||
elapsed_time = (time.time() - start_time) * 1000 # Convert to ms
|
||||
elapsed_time = (time.perf_counter() - start_time) * 1000 # Convert to ms
|
||||
avg_per_prompt = elapsed_time / len(prompts) if prompts else 0
|
||||
return request_id, elapsed_time, avg_per_prompt, True, len(prompts)
|
||||
except Exception as e:
|
||||
@@ -104,7 +104,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
|
||||
num_requests = len(batched_prompts)
|
||||
|
||||
# Record start time for total latency
|
||||
benchmark_start_time = time.time()
|
||||
benchmark_start_time = time.perf_counter()
|
||||
|
||||
for i, batch_prompts in enumerate(batched_prompts):
|
||||
request_id = i + 1
|
||||
@@ -119,7 +119,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
|
||||
results.append(result)
|
||||
|
||||
# Calculate total latency
|
||||
total_latency = (time.time() - benchmark_start_time) * 1000 # Convert to ms
|
||||
total_latency = (time.perf_counter() - benchmark_start_time) * 1000 # Convert to ms
|
||||
|
||||
return results, total_latency
|
||||
|
||||
|
||||
@@ -44,10 +44,10 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
|
||||
for run in range(NUM_RUNS):
|
||||
batch_prompts = prompts[:batch_size] # Use same prompts for fair comparison
|
||||
|
||||
start_time = time.time()
|
||||
start_time = time.perf_counter()
|
||||
for prompt in batch_prompts:
|
||||
tokens = tokenizer.encode(prompt)
|
||||
sequential_time = (time.time() - start_time) * 1000
|
||||
sequential_time = (time.perf_counter() - start_time) * 1000
|
||||
sequential_times.append(sequential_time)
|
||||
|
||||
# Batch tokenization using tokenizer()
|
||||
@@ -55,9 +55,9 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
|
||||
for run in range(NUM_RUNS):
|
||||
batch_prompts = prompts[:batch_size] # Use same prompts for fair comparison
|
||||
|
||||
start_time = time.time()
|
||||
start_time = time.perf_counter()
|
||||
tokens = tokenizer(batch_prompts)
|
||||
batch_time = (time.time() - start_time) * 1000
|
||||
batch_time = (time.perf_counter() - start_time) * 1000
|
||||
batch_times.append(batch_time)
|
||||
|
||||
return {
|
||||
|
||||
@@ -39,7 +39,7 @@ def main(args):
|
||||
answer = await call_generate(**arg, temperature=0)
|
||||
states.append(answer)
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
# we always sequentially execute agent calls to maintain its dependency
|
||||
if args.backend != "lmql":
|
||||
for arg in tqdm(arguments):
|
||||
@@ -50,7 +50,7 @@ def main(args):
|
||||
loop = asyncio.get_event_loop()
|
||||
for arg in tqdm(arguments):
|
||||
loop.run_until_complete(get_one_answer_async(arg))
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
print(f"Latency: {latency:.3f}")
|
||||
|
||||
|
||||
@@ -35,14 +35,14 @@ def main(args):
|
||||
|
||||
states = []
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
for a in arguments:
|
||||
# only a single key in the dict
|
||||
for func, arg in a.items():
|
||||
result = func.run(**arg)
|
||||
result.sync()
|
||||
states.append(result)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
print(f"Latency: {latency:.3f}")
|
||||
|
||||
@@ -75,7 +75,7 @@ def main(args):
|
||||
)
|
||||
states[i] = answer
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
if args.parallel == 1:
|
||||
for i in tqdm(range(len(questions))):
|
||||
get_one_answer(i)
|
||||
@@ -106,9 +106,9 @@ def main(args):
|
||||
for j in range(len(rets)):
|
||||
states[i + j] = rets[j]
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
asyncio.run(batched_call(batch_size=args.parallel))
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
preds = []
|
||||
for i in range(len(states)):
|
||||
|
||||
@@ -84,14 +84,14 @@ def main(args):
|
||||
#####################################
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = few_shot_gsm8k.run_batch(
|
||||
arguments,
|
||||
temperature=0,
|
||||
num_threads=args.parallel,
|
||||
progress_bar=True,
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
preds = []
|
||||
for i in range(len(states)):
|
||||
|
||||
@@ -57,7 +57,7 @@ def main(args):
|
||||
context=few_shot_examples + questions[i], choices=choices[i]
|
||||
)
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
if args.parallel == 1:
|
||||
for i in tqdm(range(len(questions))):
|
||||
get_one_answer(i)
|
||||
@@ -82,10 +82,10 @@ def main(args):
|
||||
for j in range(len(rets)):
|
||||
preds[i + j] = rets[j]
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
asyncio.run(batched_call(batch_size=args.parallel))
|
||||
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
acc = np.mean(np.array(preds) == np.array(labels))
|
||||
|
||||
@@ -68,7 +68,7 @@ def main(args):
|
||||
#####################################
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
rets = few_shot_hellaswag.run_batch(
|
||||
arguments,
|
||||
temperature=0,
|
||||
@@ -76,7 +76,7 @@ def main(args):
|
||||
progress_bar=True,
|
||||
)
|
||||
preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
acc = np.mean(np.array(preds) == np.array(labels))
|
||||
|
||||
@@ -261,7 +261,7 @@ class WorkloadGenerator:
|
||||
client_id, payload = item
|
||||
response = await async_request_sglang_generate(payload, self.url, self.pbar)
|
||||
if self.pbar.n == self.pbar.total:
|
||||
self.finished_time = time.time()
|
||||
self.finished_time = time.perf_counter()
|
||||
self.response_queue.put((client_id, response))
|
||||
except Exception as e:
|
||||
print(f"Request failed: {e}")
|
||||
@@ -334,7 +334,7 @@ class WorkloadGenerator:
|
||||
request_thread = threading.Thread(target=self.request_sender, daemon=True)
|
||||
response_thread = threading.Thread(target=self.response_handler, daemon=True)
|
||||
|
||||
self.start_time = time.time()
|
||||
self.start_time = time.perf_counter()
|
||||
request_thread.start()
|
||||
response_thread.start()
|
||||
|
||||
|
||||
@@ -53,7 +53,7 @@ def main(args):
|
||||
def get_one_answer(i):
|
||||
states[i] = json_decode(generate=call_generate, **arguments[i])
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
if args.parallel == 1:
|
||||
for i in tqdm(range(len(arguments))):
|
||||
get_one_answer(i)
|
||||
@@ -68,7 +68,7 @@ def main(args):
|
||||
for _ in rets:
|
||||
pass
|
||||
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
print(f"Latency: {latency:.3f}")
|
||||
|
||||
@@ -63,11 +63,11 @@ def main(args):
|
||||
json_warm_up.run().sync()
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = json_decode.run_batch(
|
||||
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
print(f"Latency: {latency:.3f}")
|
||||
|
||||
@@ -175,7 +175,7 @@ def bench_character(args):
|
||||
else:
|
||||
raise ValueError(f"Invalid backend: {args.backend}")
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
|
||||
if args.backend != "lmql":
|
||||
if args.parallel == 1:
|
||||
@@ -202,7 +202,7 @@ def bench_character(args):
|
||||
asyncio.gather(*[get_one_answer_async(i) for i in bt])
|
||||
)
|
||||
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
return states, latency
|
||||
|
||||
@@ -236,7 +236,7 @@ def bench_city_doc(args):
|
||||
else:
|
||||
raise ValueError(f"Invalid backend: {args.backend}")
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
if args.parallel == 1:
|
||||
for i in tqdm(range(len(arguments))):
|
||||
get_one_answer(i)
|
||||
@@ -246,7 +246,7 @@ def bench_city_doc(args):
|
||||
for _ in rets:
|
||||
pass
|
||||
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
return states, latency
|
||||
|
||||
|
||||
@@ -67,14 +67,14 @@ def bench_city_doc(args):
|
||||
sgl.set_default_backend(backend)
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = city_gen.run_batch(
|
||||
arguments,
|
||||
temperature=0,
|
||||
num_threads=args.parallel,
|
||||
progress_bar=True,
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
return states, latency
|
||||
|
||||
@@ -91,14 +91,14 @@ def bench_character(args):
|
||||
sgl.set_default_backend(backend)
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = character_gen.run_batch(
|
||||
arguments,
|
||||
temperature=0,
|
||||
num_threads=args.parallel,
|
||||
progress_bar=True,
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
return states, latency
|
||||
|
||||
|
||||
@@ -85,14 +85,14 @@ def bench_schema(args):
|
||||
sgl.set_default_backend(backend)
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = schema_gen.run_batch(
|
||||
arguments,
|
||||
temperature=0,
|
||||
num_threads=args.parallel,
|
||||
progress_bar=True,
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Check if the outputs are valid
|
||||
indexes = []
|
||||
|
||||
@@ -487,7 +487,7 @@ def main(args: argparse.Namespace):
|
||||
]
|
||||
print(f"Start tuning over {len(search_space)} configurations...")
|
||||
|
||||
start = time.time()
|
||||
start = time.perf_counter()
|
||||
configs = _distribute(
|
||||
"tune",
|
||||
[
|
||||
@@ -522,7 +522,7 @@ def main(args: argparse.Namespace):
|
||||
use_int8_w8a16,
|
||||
block_shape,
|
||||
)
|
||||
end = time.time()
|
||||
end = time.perf_counter()
|
||||
print(f"Tuning took {end - start:.2f} seconds")
|
||||
else:
|
||||
outputs = _distribute(
|
||||
|
||||
@@ -359,7 +359,7 @@ def tune_on_gpu(args_dict):
|
||||
config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0
|
||||
]
|
||||
|
||||
start = time.time()
|
||||
start = time.perf_counter()
|
||||
results = {}
|
||||
for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
|
||||
N, K = shape[0], shape[1]
|
||||
@@ -379,7 +379,7 @@ def tune_on_gpu(args_dict):
|
||||
best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)}
|
||||
save_configs(N, K, block_n, block_k, best_configs, save_path, input_type)
|
||||
|
||||
end = time.time()
|
||||
end = time.perf_counter()
|
||||
print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
|
||||
|
||||
|
||||
|
||||
@@ -70,7 +70,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
|
||||
# Select backend
|
||||
backend = select_sglang_backend(args)
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = line_retrieval.run_batch(
|
||||
arguments,
|
||||
temperature=0,
|
||||
@@ -78,7 +78,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
|
||||
num_threads=args.parallel,
|
||||
progress_bar=True,
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
corrects = []
|
||||
for i in range(len(arguments)):
|
||||
|
||||
@@ -41,7 +41,7 @@ def main(args):
|
||||
sgl.set_default_backend(backend)
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
if args.parallel == 1:
|
||||
for i in tqdm.tqdm(range(len(lines))):
|
||||
image_file = arguments[i]["image_file"]
|
||||
@@ -52,7 +52,7 @@ def main(args):
|
||||
states = image_qa.run_batch(
|
||||
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
print(f"Latency: {latency:.3f}")
|
||||
|
||||
|
||||
@@ -85,7 +85,7 @@ def main(args):
|
||||
call_generate = partial(get_call_generate(args), temperature=0)
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
|
||||
if args.backend != "lmql":
|
||||
|
||||
@@ -120,7 +120,7 @@ def main(args):
|
||||
asyncio.gather(*[get_one_answer_async(i) for i in bt])
|
||||
)
|
||||
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
print(f"Latency: {latency:.3f}")
|
||||
|
||||
@@ -59,7 +59,7 @@ def main(args):
|
||||
backend = select_sglang_backend(args)
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = multi_dimension_judge.run_batch(
|
||||
arguments,
|
||||
temperature=0,
|
||||
@@ -67,7 +67,7 @@ def main(args):
|
||||
num_threads=args.parallel,
|
||||
progress_bar=True,
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
print(f"Latency: {latency:.3f}")
|
||||
|
||||
|
||||
@@ -45,7 +45,7 @@ def main(args):
|
||||
def get_one_answer(i):
|
||||
states[i] = json_decode(generate=call_generate, **arguments[i])
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
if args.parallel == 1:
|
||||
for i in tqdm(range(len(arguments))):
|
||||
get_one_answer(i)
|
||||
@@ -58,7 +58,7 @@ def main(args):
|
||||
)
|
||||
)
|
||||
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
print(f"Latency: {latency:.3f}")
|
||||
|
||||
@@ -46,11 +46,11 @@ def main(args):
|
||||
sgl.set_default_backend(backend)
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = json_decode.run_batch(
|
||||
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
print(f"Latency: {latency:.3f}")
|
||||
|
||||
@@ -76,7 +76,7 @@ def evaluate(args, subject, dev_df, test_df, call_generate):
|
||||
pred = call_generate(prompts[i], temperature=0, max_tokens=max_tokens)
|
||||
preds[i] = pred.strip()[0]
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
if args.parallel == 1:
|
||||
for i in range(len(prompts)):
|
||||
get_one_answer(i)
|
||||
@@ -94,9 +94,9 @@ def evaluate(args, subject, dev_df, test_df, call_generate):
|
||||
for j in range(len(rets)):
|
||||
preds[i + j] = rets[j].strip()[0]
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
asyncio.run(batched_call(batch_size=args.parallel))
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
cors = [pred == label for pred, label in zip(preds, labels)]
|
||||
|
||||
@@ -116,7 +116,7 @@ def main(args):
|
||||
backend = select_sglang_backend(args)
|
||||
|
||||
# Run
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = few_shot_mmlu.run_batch(
|
||||
arguments,
|
||||
temperature=0,
|
||||
@@ -128,7 +128,7 @@ def main(args):
|
||||
preds = [
|
||||
s["answer"].strip()[0] if len(s["answer"].strip()) > 0 else "" for s in states
|
||||
]
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
cors = [pred == label for pred, label in zip(preds, labels)]
|
||||
|
||||
@@ -119,7 +119,7 @@ async def eval_mmmu(args) -> None:
|
||||
api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1"
|
||||
)
|
||||
semaphore = asyncio.Semaphore(args.concurrency)
|
||||
start = time.time()
|
||||
start = time.perf_counter()
|
||||
base_url = f"http://127.0.0.1:{args.port}"
|
||||
|
||||
if args.profile:
|
||||
@@ -147,7 +147,7 @@ async def eval_mmmu(args) -> None:
|
||||
if profile_output.success:
|
||||
print("Profiler stopped")
|
||||
|
||||
print(f"Benchmark time: {time.time() - start}")
|
||||
print(f"Benchmark time: {time.perf_counter() - start}")
|
||||
args.output_path = f"./val_sglang.json"
|
||||
save_json(args.output_path, out_samples)
|
||||
eval_result(model_answer_path=args.output_path, answer_dict=answer_dict)
|
||||
|
||||
@@ -66,7 +66,7 @@ def main(args):
|
||||
answers[i] = cur_answers
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
if args.parallel == 1:
|
||||
for i in tqdm(range(len(questions))):
|
||||
get_answer(i)
|
||||
@@ -79,7 +79,7 @@ def main(args):
|
||||
)
|
||||
)
|
||||
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
|
||||
|
||||
|
||||
@@ -57,7 +57,7 @@ def main(args):
|
||||
sgl.set_default_backend(backend)
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
rets = answer_mt_bench.run_batch(
|
||||
arguments,
|
||||
temperature=0,
|
||||
@@ -66,7 +66,7 @@ def main(args):
|
||||
progress_bar=True,
|
||||
)
|
||||
answers = [[s["answer_1"], s["answer_2"]] for s in rets]
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
|
||||
|
||||
|
||||
@@ -68,7 +68,7 @@ def main(args):
|
||||
sgl.set_default_backend(backend)
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
rets = answer_mt_bench.run_batch(
|
||||
arguments,
|
||||
temperature=0,
|
||||
@@ -78,7 +78,7 @@ def main(args):
|
||||
)
|
||||
answers = [[s["answer_1"], s["answer_2"]] for s in rets]
|
||||
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
num_output_tokens = sum(
|
||||
s.get_meta_info("answer_1")["completion_tokens"]
|
||||
+ s.get_meta_info("answer_2")["completion_tokens"]
|
||||
|
||||
@@ -113,7 +113,7 @@ def main(args):
|
||||
answer = multi_chain_gsm8k(questions[i], args.num_chains, call_generate)
|
||||
states[i] = answer
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
if args.parallel == 1:
|
||||
for i in tqdm(range(len(questions))):
|
||||
get_one_answer(i)
|
||||
@@ -134,7 +134,7 @@ def main(args):
|
||||
)
|
||||
states[i] = answer
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
loop = asyncio.get_event_loop()
|
||||
batches = [
|
||||
list(range(i, min(i + args.parallel, len(questions))))
|
||||
@@ -144,7 +144,7 @@ def main(args):
|
||||
tasks = [get_one_answer_asyncio(k) for k in bt]
|
||||
loop.run_until_complete(asyncio.gather(*tasks))
|
||||
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
preds = []
|
||||
for i in range(len(states)):
|
||||
|
||||
@@ -90,7 +90,7 @@ def main(args):
|
||||
backend = select_sglang_backend(args)
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = multi_chain_gsm8k.run_batch(
|
||||
arguments,
|
||||
temperature=0,
|
||||
@@ -98,7 +98,7 @@ def main(args):
|
||||
num_threads=args.parallel,
|
||||
progress_bar=True,
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
preds = []
|
||||
for i in range(len(states)):
|
||||
|
||||
@@ -61,7 +61,7 @@ def main(args):
|
||||
def get_one_answer(i):
|
||||
states[i] = multi_document_qa(generate=call_generate, **arguments[i])
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
if args.parallel == 1:
|
||||
for i in tqdm(range(len(labels))):
|
||||
get_one_answer(i)
|
||||
@@ -74,7 +74,7 @@ def main(args):
|
||||
)
|
||||
)
|
||||
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
print(states)
|
||||
|
||||
@@ -49,11 +49,11 @@ def main(args):
|
||||
sgl.set_default_backend(backend)
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = multi_document_qa.run_batch(
|
||||
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
print([s["answer"] for s in states])
|
||||
|
||||
@@ -35,7 +35,7 @@ def main(args):
|
||||
def get_one_answer(i):
|
||||
states[i] = multi_turns(generate=call_generate, **multi_qas[i])
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
if args.parallel == 1:
|
||||
for i in tqdm(range(len(multi_qas))):
|
||||
get_one_answer(i)
|
||||
@@ -50,7 +50,7 @@ def main(args):
|
||||
for _ in rets:
|
||||
pass
|
||||
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
print(f"Latency: {latency:.3f}")
|
||||
|
||||
@@ -27,7 +27,7 @@ def main(args):
|
||||
|
||||
backend = select_sglang_backend(args)
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = multi_turns.run_batch(
|
||||
multi_qas,
|
||||
temperature=0,
|
||||
@@ -35,7 +35,7 @@ def main(args):
|
||||
num_threads=args.parallel,
|
||||
progress_bar=True,
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
print(f"Latency: {latency:.3f}")
|
||||
|
||||
|
||||
@@ -84,7 +84,7 @@ def main(args):
|
||||
|
||||
backend = select_sglang_backend(args)
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = multi_turns.run_batch(
|
||||
multi_qas,
|
||||
temperature=0,
|
||||
@@ -92,7 +92,7 @@ def main(args):
|
||||
num_threads="auto",
|
||||
progress_bar=True,
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
print(f"Latency: {latency:.3f}")
|
||||
|
||||
|
||||
@@ -146,7 +146,7 @@ def main(args):
|
||||
|
||||
states.append(answer)
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
|
||||
if args.backend != "lmql":
|
||||
if args.parallel == 1:
|
||||
@@ -173,7 +173,7 @@ def main(args):
|
||||
tasks = [run_single_agent_async(arg) for arg in bt]
|
||||
loop.run_until_complete(asyncio.gather(*tasks))
|
||||
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
print(f"Latency: {latency:.3f}")
|
||||
|
||||
|
||||
@@ -115,14 +115,14 @@ def main(args):
|
||||
sgl.set_default_backend(backend)
|
||||
|
||||
states = []
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = webthink.run_batch(
|
||||
arguments,
|
||||
temperature=0,
|
||||
num_threads=args.parallel,
|
||||
progress_bar=True,
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
print(f"Latency: {latency:.3f}")
|
||||
|
||||
@@ -51,7 +51,7 @@ def main(args):
|
||||
)
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = reasoning_gen.run_batch(
|
||||
questions,
|
||||
num_threads=args.parallel,
|
||||
@@ -60,7 +60,7 @@ def main(args):
|
||||
max_new_tokens=32768,
|
||||
top_p=0.95,
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Extract results and record outcomes in a list.
|
||||
outcomes = []
|
||||
|
||||
@@ -68,7 +68,7 @@ def main(args):
|
||||
call_generate = partial(get_call_generate(args), temperature=0)
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
if args.backend != "lmql":
|
||||
|
||||
def get_one_answer(i):
|
||||
@@ -102,7 +102,7 @@ def main(args):
|
||||
loop.run_until_complete(
|
||||
asyncio.gather(*[get_one_answer_async(i) for i in batch])
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
print(f"Latency: {latency:.3f}")
|
||||
|
||||
@@ -65,11 +65,11 @@ def main(args):
|
||||
sgl.set_default_backend(select_sglang_backend(args))
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = suggest_tips.run_batch(
|
||||
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
print(f"Latency: {latency:.3f}")
|
||||
|
||||
@@ -138,7 +138,7 @@ def main(args):
|
||||
# Run requests
|
||||
states = [None] * len(questions)
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
if args.backend != "lmql":
|
||||
|
||||
def get_one_answer(i):
|
||||
@@ -177,7 +177,7 @@ def main(args):
|
||||
tasks = [get_one_answer_async(k) for k in bt]
|
||||
loop.run_until_complete(asyncio.gather(*tasks))
|
||||
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
answers_text = []
|
||||
for s in states:
|
||||
|
||||
@@ -119,7 +119,7 @@ def main(args):
|
||||
backend = select_sglang_backend(args)
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = tree_search.run_batch(
|
||||
arguments,
|
||||
temperature=0,
|
||||
@@ -127,7 +127,7 @@ def main(args):
|
||||
num_threads=args.parallel,
|
||||
progress_bar=True,
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
answers_text = []
|
||||
for s in states:
|
||||
answers_text.append([x for xs in s.ret_value for x in xs])
|
||||
|
||||
@@ -121,7 +121,7 @@ def main(args):
|
||||
def get_one_answer(i):
|
||||
states[i] = tree_search(**arguments[i], call_generate=call_generate)
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
if args.parallel == 1:
|
||||
for i in tqdm(range(len(questions))):
|
||||
get_one_answer(i)
|
||||
@@ -134,7 +134,7 @@ def main(args):
|
||||
)
|
||||
)
|
||||
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
answers_text = []
|
||||
for s in states:
|
||||
|
||||
@@ -107,7 +107,7 @@ def main(args):
|
||||
backend = select_sglang_backend(args)
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = tree_search.run_batch(
|
||||
arguments,
|
||||
temperature=0,
|
||||
@@ -115,7 +115,7 @@ def main(args):
|
||||
num_threads=args.parallel,
|
||||
progress_bar=True,
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
answers_text = []
|
||||
for s in states:
|
||||
answers_text.append([x for xs in s["answer"] for x in xs])
|
||||
|
||||
@@ -90,7 +90,7 @@ def run_eval(args):
|
||||
#####################################
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
states = few_shot_gsm8k.run_batch(
|
||||
arguments,
|
||||
temperature=args.temperature if hasattr(args, "temperature") else 0,
|
||||
@@ -99,7 +99,7 @@ def run_eval(args):
|
||||
return_logprob=getattr(args, "return_logprob", None),
|
||||
logprob_start_len=getattr(args, "logprob_start_len", None),
|
||||
)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
preds = []
|
||||
for i in range(len(states)):
|
||||
|
||||
@@ -89,7 +89,7 @@ def run_eval(args):
|
||||
}
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
|
||||
@@ -98,7 +98,7 @@ def run_eval(args):
|
||||
)
|
||||
|
||||
# End requests
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Shutdown the engine
|
||||
engine.shutdown()
|
||||
|
||||
@@ -71,9 +71,9 @@ def run_eval(args):
|
||||
)
|
||||
|
||||
# Run eval
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
result = eval_obj(sampler)
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Dump reports
|
||||
metrics = result.metrics | {"score": result.score}
|
||||
|
||||
@@ -503,7 +503,7 @@ def test_hellaswag_select():
|
||||
#####################################
|
||||
|
||||
# Run requests
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
rets = few_shot_hellaswag.run_batch(
|
||||
arguments,
|
||||
temperature=0,
|
||||
@@ -514,13 +514,13 @@ def test_hellaswag_select():
|
||||
preds = []
|
||||
for i, ret in enumerate(rets):
|
||||
preds.append(choices[i].index(ret["answer"]))
|
||||
latency = time.time() - tic
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
accuracy = np.mean(np.array(preds) == np.array(labels))
|
||||
|
||||
# Test generator style of run_batch
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
rets = few_shot_hellaswag.run_batch(
|
||||
arguments,
|
||||
temperature=0,
|
||||
@@ -531,7 +531,7 @@ def test_hellaswag_select():
|
||||
preds_gen = []
|
||||
for i, ret in enumerate(rets):
|
||||
preds_gen.append(choices[i].index(ret["answer"]))
|
||||
latency_gen = time.time() - tic
|
||||
latency_gen = time.perf_counter() - tic
|
||||
|
||||
# Compute accuracy
|
||||
accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
|
||||
|
||||
@@ -449,9 +449,9 @@ def popen_launch_server(
|
||||
else:
|
||||
process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
|
||||
|
||||
start_time = time.time()
|
||||
start_time = time.perf_counter()
|
||||
with requests.Session() as session:
|
||||
while time.time() - start_time < timeout:
|
||||
while time.perf_counter() - start_time < timeout:
|
||||
try:
|
||||
headers = {
|
||||
"Content-Type": "application/json; charset=utf-8",
|
||||
@@ -584,7 +584,7 @@ class TestFile:
|
||||
|
||||
|
||||
def run_unittest_files(files: List[TestFile], timeout_per_file: float):
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
success = True
|
||||
|
||||
for i, file in enumerate(files):
|
||||
@@ -599,13 +599,13 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
|
||||
f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
|
||||
flush=True,
|
||||
)
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
|
||||
process = subprocess.Popen(
|
||||
["python3", filename], stdout=None, stderr=None, env=os.environ
|
||||
)
|
||||
process.wait()
|
||||
elapsed = time.time() - tic
|
||||
elapsed = time.perf_counter() - tic
|
||||
|
||||
print(
|
||||
f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
|
||||
@@ -631,9 +631,9 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
|
||||
break
|
||||
|
||||
if success:
|
||||
print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
|
||||
print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
|
||||
else:
|
||||
print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
|
||||
print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
|
||||
|
||||
return 0 if success else -1
|
||||
|
||||
|
||||
@@ -92,9 +92,9 @@ def popen_launch_router(
|
||||
|
||||
process = subprocess.Popen(command, stdout=None, stderr=None)
|
||||
|
||||
start_time = time.time()
|
||||
start_time = time.perf_counter()
|
||||
with requests.Session() as session:
|
||||
while time.time() - start_time < timeout:
|
||||
while time.perf_counter() - start_time < timeout:
|
||||
try:
|
||||
response = session.get(f"{base_url}/health")
|
||||
if response.status_code == 200:
|
||||
@@ -155,11 +155,11 @@ def terminate_and_wait(process, timeout=300):
|
||||
return
|
||||
|
||||
process.terminate()
|
||||
start_time = time.time()
|
||||
start_time = time.perf_counter()
|
||||
|
||||
while process.poll() is None:
|
||||
print(f"Terminating process {process.pid}")
|
||||
if time.time() - start_time > timeout:
|
||||
if time.perf_counter() - start_time > timeout:
|
||||
raise TimeoutError(
|
||||
f"Process {process.pid} failed to terminate within {timeout}s"
|
||||
)
|
||||
|
||||
@@ -184,9 +184,9 @@ class ExperimentRunner:
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
def wait_for_server(self, port: int, timeout: int = 300) -> bool:
|
||||
start_time = time.time()
|
||||
start_time = time.perf_counter()
|
||||
|
||||
while time.time() - start_time < timeout:
|
||||
while time.perf_counter() - start_time < timeout:
|
||||
try:
|
||||
response = requests.get(f"http://localhost:{port}/health")
|
||||
if response.status_code == 200:
|
||||
@@ -197,7 +197,7 @@ class ExperimentRunner:
|
||||
return False
|
||||
|
||||
def run_task(self, config: TaskConfig) -> TaskResult:
|
||||
start_time = time.time()
|
||||
start_time = time.perf_counter()
|
||||
client_output = []
|
||||
|
||||
try:
|
||||
@@ -247,7 +247,7 @@ class ExperimentRunner:
|
||||
name=config.name,
|
||||
success=True,
|
||||
output=formatted_output,
|
||||
runtime=time.time() - start_time,
|
||||
runtime=time.perf_counter() - start_time,
|
||||
timestamp=datetime.now().isoformat(),
|
||||
)
|
||||
|
||||
@@ -256,7 +256,7 @@ class ExperimentRunner:
|
||||
name=config.name,
|
||||
success=False,
|
||||
output=str(e),
|
||||
runtime=time.time() - start_time,
|
||||
runtime=time.perf_counter() - start_time,
|
||||
timestamp=datetime.now().isoformat(),
|
||||
)
|
||||
|
||||
|
||||
@@ -79,9 +79,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
|
||||
# warm up
|
||||
hf_outputs = hf_runner.forward(truncated_prompts)
|
||||
|
||||
st_start_time = time.time()
|
||||
st_start_time = time.perf_counter()
|
||||
hf_outputs = hf_runner.forward(truncated_prompts)
|
||||
st_end_time = time.time()
|
||||
st_end_time = time.perf_counter()
|
||||
|
||||
with SRTRunner(
|
||||
model_path,
|
||||
@@ -95,9 +95,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
|
||||
# warm up
|
||||
srt_outputs = srt_runner.forward(truncated_prompts)
|
||||
|
||||
sgl_start_time = time.time()
|
||||
sgl_start_time = time.perf_counter()
|
||||
srt_outputs = srt_runner.forward(truncated_prompts)
|
||||
sgl_end_time = time.time()
|
||||
sgl_end_time = time.perf_counter()
|
||||
|
||||
transformer_time = st_end_time - st_start_time
|
||||
sgl_time = sgl_end_time - sgl_start_time
|
||||
|
||||
@@ -130,9 +130,9 @@ class TestGPTQModelDynamic(CustomTestCase):
|
||||
def test_throughput(self):
|
||||
max_tokens = 256
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
result = self.run_decode(max_tokens)
|
||||
tok = time.time()
|
||||
tok = time.perf_counter()
|
||||
|
||||
print(f"result = `{result}`")
|
||||
|
||||
@@ -185,9 +185,9 @@ class TestGPTQModelDynamicWithMarlin(CustomTestCase):
|
||||
def test_throughput(self):
|
||||
max_tokens = 256
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
result = self.run_decode(max_tokens)
|
||||
tok = time.time()
|
||||
tok = time.perf_counter()
|
||||
|
||||
print(f"result = `{result}`")
|
||||
|
||||
|
||||
@@ -42,10 +42,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
|
||||
)
|
||||
|
||||
print("release_memory_occupation start")
|
||||
t = time.time()
|
||||
t = time.perf_counter()
|
||||
engine.release_memory_occupation()
|
||||
if _DEBUG_EXTRA:
|
||||
print("release_memory_occupation", time.time() - t)
|
||||
print("release_memory_occupation", time.perf_counter() - t)
|
||||
|
||||
if _DEBUG_EXTRA:
|
||||
time.sleep(5)
|
||||
@@ -60,10 +60,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
|
||||
time.sleep(5)
|
||||
|
||||
print("resume_memory_occupation start")
|
||||
t = time.time()
|
||||
t = time.perf_counter()
|
||||
engine.resume_memory_occupation()
|
||||
if _DEBUG_EXTRA:
|
||||
print("resume_memory_occupation", time.time() - t)
|
||||
print("resume_memory_occupation", time.perf_counter() - t)
|
||||
|
||||
self.assertEqual(
|
||||
_try_allocate_big_tensor(),
|
||||
|
||||
@@ -62,9 +62,9 @@ class TestTorchCompile(CustomTestCase):
|
||||
res = self.run_decode(16)
|
||||
|
||||
max_tokens = 256
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
res = self.run_decode(max_tokens)
|
||||
tok = time.time()
|
||||
tok = time.perf_counter()
|
||||
print(f"{res=}")
|
||||
throughput = max_tokens / (tok - tic)
|
||||
print(f"Throughput: {throughput} tokens/s")
|
||||
|
||||
@@ -62,9 +62,9 @@ class TestTorchCompileMoe(CustomTestCase):
|
||||
res = self.run_decode(16)
|
||||
|
||||
max_tokens = 256
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
res = self.run_decode(max_tokens)
|
||||
tok = time.time()
|
||||
tok = time.perf_counter()
|
||||
print(f"{res=}")
|
||||
throughput = max_tokens / (tok - tic)
|
||||
self.assertGreaterEqual(throughput, 285)
|
||||
|
||||
@@ -61,9 +61,9 @@ class TestTorchAO(CustomTestCase):
|
||||
|
||||
max_tokens = 256
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
res = self.run_decode(max_tokens)
|
||||
tok = time.time()
|
||||
tok = time.perf_counter()
|
||||
print(res["text"])
|
||||
throughput = max_tokens / (tok - tic)
|
||||
print(f"Throughput: {throughput} tokens/s")
|
||||
|
||||
@@ -164,7 +164,7 @@ def init_process_hf(
|
||||
)
|
||||
dist.barrier(group=group, device_ids=[rank])
|
||||
torch.cuda.synchronize()
|
||||
time_begin_broadcast = time.time()
|
||||
time_begin_broadcast = time.perf_counter()
|
||||
|
||||
# The last parameter is lm_head.weight, which is tied
|
||||
# with embed_tokens.weight. Actually, we only need
|
||||
@@ -182,7 +182,7 @@ def init_process_hf(
|
||||
group=group,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
time_end_broadcast = time.time()
|
||||
time_end_broadcast = time.perf_counter()
|
||||
|
||||
# Measure the latency of broadcasting/weights update.
|
||||
broadcast_time = time_end_broadcast - time_begin_broadcast
|
||||
@@ -282,7 +282,7 @@ def init_process_sgl(
|
||||
)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
time_begin_update = time.time()
|
||||
time_begin_update = time.perf_counter()
|
||||
|
||||
# The last parameter is lm_head.weight, which is tied
|
||||
# with embed_tokens.weight. Actually, we only need
|
||||
@@ -312,7 +312,7 @@ def init_process_sgl(
|
||||
},
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
time_end_update = time.time()
|
||||
time_end_update = time.perf_counter()
|
||||
|
||||
# Measure the latency of broadcast/weights update.
|
||||
update_time = time_end_update - time_begin_update
|
||||
|
||||
@@ -21,9 +21,9 @@ def test_update_weights_from_tensor(tp_size):
|
||||
memory_before = torch.cuda.memory_allocated()
|
||||
new_tensor = torch.full((16384, 2048), 1.5, device="cuda")
|
||||
|
||||
time_start = time.time()
|
||||
time_start = time.perf_counter()
|
||||
engine.update_weights_from_tensor([(x, new_tensor) for x in param_names])
|
||||
print(f"Time delta: {time.time() - time_start:.03f}")
|
||||
print(f"Time delta: {time.perf_counter() - time_start:.03f}")
|
||||
|
||||
for param_name in param_names[:3]:
|
||||
_check_param(engine, param_name, [1.5] * 5)
|
||||
|
||||
@@ -62,9 +62,9 @@ class TestW8A8(CustomTestCase):
|
||||
def test_throughput(self):
|
||||
max_tokens = 256
|
||||
|
||||
tic = time.time()
|
||||
tic = time.perf_counter()
|
||||
res = self.run_decode(max_tokens)
|
||||
tok = time.time()
|
||||
tok = time.perf_counter()
|
||||
print(res["text"])
|
||||
throughput = max_tokens / (tok - tic)
|
||||
print(f"Throughput: {throughput} tokens/s")
|
||||
|
||||
Reference in New Issue
Block a user