Replace time.time() with time.perf_counter() for benchmarking. (#6178)
Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
This commit is contained in:
@@ -64,11 +64,11 @@ def test_batch_by_batch(all_prompts, gen_len):
|
|||||||
|
|
||||||
tot_time = 0
|
tot_time = 0
|
||||||
for i in range(len(all_prompts)):
|
for i in range(len(all_prompts)):
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
text_qa.run_batch(
|
text_qa.run_batch(
|
||||||
list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))),
|
list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))),
|
||||||
)
|
)
|
||||||
tot_time += time.time() - tic
|
tot_time += time.perf_counter() - tic
|
||||||
|
|
||||||
return tot_time
|
return tot_time
|
||||||
|
|
||||||
@@ -78,13 +78,13 @@ def test_batch_by_batch_with_hint(all_prompts, gen_len):
|
|||||||
|
|
||||||
tot_time = 0
|
tot_time = 0
|
||||||
for i in range(len(all_prompts)):
|
for i in range(len(all_prompts)):
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
# Send a hint to cache the prefix
|
# Send a hint to cache the prefix
|
||||||
text_qa.run_batch(list(zip(all_prompts[i][:1], [gen_len])))
|
text_qa.run_batch(list(zip(all_prompts[i][:1], [gen_len])))
|
||||||
# Send the batch
|
# Send the batch
|
||||||
text_qa.run_batch(list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))))
|
text_qa.run_batch(list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))))
|
||||||
|
|
||||||
tot_time += time.time() - tic
|
tot_time += time.perf_counter() - tic
|
||||||
|
|
||||||
return tot_time
|
return tot_time
|
||||||
|
|
||||||
@@ -94,11 +94,11 @@ def test_send_all(all_prompts, gen_len):
|
|||||||
|
|
||||||
all_prompts = [x for prompt_list in all_prompts for x in prompt_list]
|
all_prompts = [x for prompt_list in all_prompts for x in prompt_list]
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
text_qa.run_batch(
|
text_qa.run_batch(
|
||||||
list(zip(all_prompts, [gen_len] * len(all_prompts))),
|
list(zip(all_prompts, [gen_len] * len(all_prompts))),
|
||||||
)
|
)
|
||||||
tot_time = time.time() - tic
|
tot_time = time.perf_counter() - tic
|
||||||
|
|
||||||
return tot_time
|
return tot_time
|
||||||
|
|
||||||
|
|||||||
@@ -81,7 +81,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
|
|||||||
}
|
}
|
||||||
data = {"text": prompts, "sampling_params": sampling_params}
|
data = {"text": prompts, "sampling_params": sampling_params}
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.perf_counter()
|
||||||
try:
|
try:
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
endpoint.base_url + "/generate", json=data, timeout=3600
|
endpoint.base_url + "/generate", json=data, timeout=3600
|
||||||
@@ -90,7 +90,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
|
|||||||
error = response.json()
|
error = response.json()
|
||||||
raise RuntimeError(f"Request {request_id} failed: {error}")
|
raise RuntimeError(f"Request {request_id} failed: {error}")
|
||||||
result = response.json()
|
result = response.json()
|
||||||
elapsed_time = (time.time() - start_time) * 1000 # Convert to ms
|
elapsed_time = (time.perf_counter() - start_time) * 1000 # Convert to ms
|
||||||
avg_per_prompt = elapsed_time / len(prompts) if prompts else 0
|
avg_per_prompt = elapsed_time / len(prompts) if prompts else 0
|
||||||
return request_id, elapsed_time, avg_per_prompt, True, len(prompts)
|
return request_id, elapsed_time, avg_per_prompt, True, len(prompts)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -104,7 +104,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
|
|||||||
num_requests = len(batched_prompts)
|
num_requests = len(batched_prompts)
|
||||||
|
|
||||||
# Record start time for total latency
|
# Record start time for total latency
|
||||||
benchmark_start_time = time.time()
|
benchmark_start_time = time.perf_counter()
|
||||||
|
|
||||||
for i, batch_prompts in enumerate(batched_prompts):
|
for i, batch_prompts in enumerate(batched_prompts):
|
||||||
request_id = i + 1
|
request_id = i + 1
|
||||||
@@ -119,7 +119,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
|
|||||||
results.append(result)
|
results.append(result)
|
||||||
|
|
||||||
# Calculate total latency
|
# Calculate total latency
|
||||||
total_latency = (time.time() - benchmark_start_time) * 1000 # Convert to ms
|
total_latency = (time.perf_counter() - benchmark_start_time) * 1000 # Convert to ms
|
||||||
|
|
||||||
return results, total_latency
|
return results, total_latency
|
||||||
|
|
||||||
|
|||||||
@@ -44,10 +44,10 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
|
|||||||
for run in range(NUM_RUNS):
|
for run in range(NUM_RUNS):
|
||||||
batch_prompts = prompts[:batch_size] # Use same prompts for fair comparison
|
batch_prompts = prompts[:batch_size] # Use same prompts for fair comparison
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.perf_counter()
|
||||||
for prompt in batch_prompts:
|
for prompt in batch_prompts:
|
||||||
tokens = tokenizer.encode(prompt)
|
tokens = tokenizer.encode(prompt)
|
||||||
sequential_time = (time.time() - start_time) * 1000
|
sequential_time = (time.perf_counter() - start_time) * 1000
|
||||||
sequential_times.append(sequential_time)
|
sequential_times.append(sequential_time)
|
||||||
|
|
||||||
# Batch tokenization using tokenizer()
|
# Batch tokenization using tokenizer()
|
||||||
@@ -55,9 +55,9 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
|
|||||||
for run in range(NUM_RUNS):
|
for run in range(NUM_RUNS):
|
||||||
batch_prompts = prompts[:batch_size] # Use same prompts for fair comparison
|
batch_prompts = prompts[:batch_size] # Use same prompts for fair comparison
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.perf_counter()
|
||||||
tokens = tokenizer(batch_prompts)
|
tokens = tokenizer(batch_prompts)
|
||||||
batch_time = (time.time() - start_time) * 1000
|
batch_time = (time.perf_counter() - start_time) * 1000
|
||||||
batch_times.append(batch_time)
|
batch_times.append(batch_time)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ def main(args):
|
|||||||
answer = await call_generate(**arg, temperature=0)
|
answer = await call_generate(**arg, temperature=0)
|
||||||
states.append(answer)
|
states.append(answer)
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
# we always sequentially execute agent calls to maintain its dependency
|
# we always sequentially execute agent calls to maintain its dependency
|
||||||
if args.backend != "lmql":
|
if args.backend != "lmql":
|
||||||
for arg in tqdm(arguments):
|
for arg in tqdm(arguments):
|
||||||
@@ -50,7 +50,7 @@ def main(args):
|
|||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
for arg in tqdm(arguments):
|
for arg in tqdm(arguments):
|
||||||
loop.run_until_complete(get_one_answer_async(arg))
|
loop.run_until_complete(get_one_answer_async(arg))
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
print(f"Latency: {latency:.3f}")
|
print(f"Latency: {latency:.3f}")
|
||||||
|
|
||||||
|
|||||||
@@ -35,14 +35,14 @@ def main(args):
|
|||||||
|
|
||||||
states = []
|
states = []
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
for a in arguments:
|
for a in arguments:
|
||||||
# only a single key in the dict
|
# only a single key in the dict
|
||||||
for func, arg in a.items():
|
for func, arg in a.items():
|
||||||
result = func.run(**arg)
|
result = func.run(**arg)
|
||||||
result.sync()
|
result.sync()
|
||||||
states.append(result)
|
states.append(result)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
print(f"Latency: {latency:.3f}")
|
print(f"Latency: {latency:.3f}")
|
||||||
|
|||||||
@@ -75,7 +75,7 @@ def main(args):
|
|||||||
)
|
)
|
||||||
states[i] = answer
|
states[i] = answer
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
if args.parallel == 1:
|
if args.parallel == 1:
|
||||||
for i in tqdm(range(len(questions))):
|
for i in tqdm(range(len(questions))):
|
||||||
get_one_answer(i)
|
get_one_answer(i)
|
||||||
@@ -106,9 +106,9 @@ def main(args):
|
|||||||
for j in range(len(rets)):
|
for j in range(len(rets)):
|
||||||
states[i + j] = rets[j]
|
states[i + j] = rets[j]
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
asyncio.run(batched_call(batch_size=args.parallel))
|
asyncio.run(batched_call(batch_size=args.parallel))
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
preds = []
|
preds = []
|
||||||
for i in range(len(states)):
|
for i in range(len(states)):
|
||||||
|
|||||||
@@ -84,14 +84,14 @@ def main(args):
|
|||||||
#####################################
|
#####################################
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = few_shot_gsm8k.run_batch(
|
states = few_shot_gsm8k.run_batch(
|
||||||
arguments,
|
arguments,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
num_threads=args.parallel,
|
num_threads=args.parallel,
|
||||||
progress_bar=True,
|
progress_bar=True,
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
preds = []
|
preds = []
|
||||||
for i in range(len(states)):
|
for i in range(len(states)):
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ def main(args):
|
|||||||
context=few_shot_examples + questions[i], choices=choices[i]
|
context=few_shot_examples + questions[i], choices=choices[i]
|
||||||
)
|
)
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
if args.parallel == 1:
|
if args.parallel == 1:
|
||||||
for i in tqdm(range(len(questions))):
|
for i in tqdm(range(len(questions))):
|
||||||
get_one_answer(i)
|
get_one_answer(i)
|
||||||
@@ -82,10 +82,10 @@ def main(args):
|
|||||||
for j in range(len(rets)):
|
for j in range(len(rets)):
|
||||||
preds[i + j] = rets[j]
|
preds[i + j] = rets[j]
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
asyncio.run(batched_call(batch_size=args.parallel))
|
asyncio.run(batched_call(batch_size=args.parallel))
|
||||||
|
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
acc = np.mean(np.array(preds) == np.array(labels))
|
acc = np.mean(np.array(preds) == np.array(labels))
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ def main(args):
|
|||||||
#####################################
|
#####################################
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
rets = few_shot_hellaswag.run_batch(
|
rets = few_shot_hellaswag.run_batch(
|
||||||
arguments,
|
arguments,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
@@ -76,7 +76,7 @@ def main(args):
|
|||||||
progress_bar=True,
|
progress_bar=True,
|
||||||
)
|
)
|
||||||
preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
|
preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
acc = np.mean(np.array(preds) == np.array(labels))
|
acc = np.mean(np.array(preds) == np.array(labels))
|
||||||
|
|||||||
@@ -261,7 +261,7 @@ class WorkloadGenerator:
|
|||||||
client_id, payload = item
|
client_id, payload = item
|
||||||
response = await async_request_sglang_generate(payload, self.url, self.pbar)
|
response = await async_request_sglang_generate(payload, self.url, self.pbar)
|
||||||
if self.pbar.n == self.pbar.total:
|
if self.pbar.n == self.pbar.total:
|
||||||
self.finished_time = time.time()
|
self.finished_time = time.perf_counter()
|
||||||
self.response_queue.put((client_id, response))
|
self.response_queue.put((client_id, response))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Request failed: {e}")
|
print(f"Request failed: {e}")
|
||||||
@@ -334,7 +334,7 @@ class WorkloadGenerator:
|
|||||||
request_thread = threading.Thread(target=self.request_sender, daemon=True)
|
request_thread = threading.Thread(target=self.request_sender, daemon=True)
|
||||||
response_thread = threading.Thread(target=self.response_handler, daemon=True)
|
response_thread = threading.Thread(target=self.response_handler, daemon=True)
|
||||||
|
|
||||||
self.start_time = time.time()
|
self.start_time = time.perf_counter()
|
||||||
request_thread.start()
|
request_thread.start()
|
||||||
response_thread.start()
|
response_thread.start()
|
||||||
|
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ def main(args):
|
|||||||
def get_one_answer(i):
|
def get_one_answer(i):
|
||||||
states[i] = json_decode(generate=call_generate, **arguments[i])
|
states[i] = json_decode(generate=call_generate, **arguments[i])
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
if args.parallel == 1:
|
if args.parallel == 1:
|
||||||
for i in tqdm(range(len(arguments))):
|
for i in tqdm(range(len(arguments))):
|
||||||
get_one_answer(i)
|
get_one_answer(i)
|
||||||
@@ -68,7 +68,7 @@ def main(args):
|
|||||||
for _ in rets:
|
for _ in rets:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
print(f"Latency: {latency:.3f}")
|
print(f"Latency: {latency:.3f}")
|
||||||
|
|||||||
@@ -63,11 +63,11 @@ def main(args):
|
|||||||
json_warm_up.run().sync()
|
json_warm_up.run().sync()
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = json_decode.run_batch(
|
states = json_decode.run_batch(
|
||||||
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
|
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
print(f"Latency: {latency:.3f}")
|
print(f"Latency: {latency:.3f}")
|
||||||
|
|||||||
@@ -175,7 +175,7 @@ def bench_character(args):
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid backend: {args.backend}")
|
raise ValueError(f"Invalid backend: {args.backend}")
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
|
|
||||||
if args.backend != "lmql":
|
if args.backend != "lmql":
|
||||||
if args.parallel == 1:
|
if args.parallel == 1:
|
||||||
@@ -202,7 +202,7 @@ def bench_character(args):
|
|||||||
asyncio.gather(*[get_one_answer_async(i) for i in bt])
|
asyncio.gather(*[get_one_answer_async(i) for i in bt])
|
||||||
)
|
)
|
||||||
|
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
return states, latency
|
return states, latency
|
||||||
|
|
||||||
@@ -236,7 +236,7 @@ def bench_city_doc(args):
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid backend: {args.backend}")
|
raise ValueError(f"Invalid backend: {args.backend}")
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
if args.parallel == 1:
|
if args.parallel == 1:
|
||||||
for i in tqdm(range(len(arguments))):
|
for i in tqdm(range(len(arguments))):
|
||||||
get_one_answer(i)
|
get_one_answer(i)
|
||||||
@@ -246,7 +246,7 @@ def bench_city_doc(args):
|
|||||||
for _ in rets:
|
for _ in rets:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
return states, latency
|
return states, latency
|
||||||
|
|
||||||
|
|||||||
@@ -67,14 +67,14 @@ def bench_city_doc(args):
|
|||||||
sgl.set_default_backend(backend)
|
sgl.set_default_backend(backend)
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = city_gen.run_batch(
|
states = city_gen.run_batch(
|
||||||
arguments,
|
arguments,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
num_threads=args.parallel,
|
num_threads=args.parallel,
|
||||||
progress_bar=True,
|
progress_bar=True,
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
return states, latency
|
return states, latency
|
||||||
|
|
||||||
@@ -91,14 +91,14 @@ def bench_character(args):
|
|||||||
sgl.set_default_backend(backend)
|
sgl.set_default_backend(backend)
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = character_gen.run_batch(
|
states = character_gen.run_batch(
|
||||||
arguments,
|
arguments,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
num_threads=args.parallel,
|
num_threads=args.parallel,
|
||||||
progress_bar=True,
|
progress_bar=True,
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
return states, latency
|
return states, latency
|
||||||
|
|
||||||
|
|||||||
@@ -85,14 +85,14 @@ def bench_schema(args):
|
|||||||
sgl.set_default_backend(backend)
|
sgl.set_default_backend(backend)
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = schema_gen.run_batch(
|
states = schema_gen.run_batch(
|
||||||
arguments,
|
arguments,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
num_threads=args.parallel,
|
num_threads=args.parallel,
|
||||||
progress_bar=True,
|
progress_bar=True,
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Check if the outputs are valid
|
# Check if the outputs are valid
|
||||||
indexes = []
|
indexes = []
|
||||||
|
|||||||
@@ -487,7 +487,7 @@ def main(args: argparse.Namespace):
|
|||||||
]
|
]
|
||||||
print(f"Start tuning over {len(search_space)} configurations...")
|
print(f"Start tuning over {len(search_space)} configurations...")
|
||||||
|
|
||||||
start = time.time()
|
start = time.perf_counter()
|
||||||
configs = _distribute(
|
configs = _distribute(
|
||||||
"tune",
|
"tune",
|
||||||
[
|
[
|
||||||
@@ -522,7 +522,7 @@ def main(args: argparse.Namespace):
|
|||||||
use_int8_w8a16,
|
use_int8_w8a16,
|
||||||
block_shape,
|
block_shape,
|
||||||
)
|
)
|
||||||
end = time.time()
|
end = time.perf_counter()
|
||||||
print(f"Tuning took {end - start:.2f} seconds")
|
print(f"Tuning took {end - start:.2f} seconds")
|
||||||
else:
|
else:
|
||||||
outputs = _distribute(
|
outputs = _distribute(
|
||||||
|
|||||||
@@ -359,7 +359,7 @@ def tune_on_gpu(args_dict):
|
|||||||
config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0
|
config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0
|
||||||
]
|
]
|
||||||
|
|
||||||
start = time.time()
|
start = time.perf_counter()
|
||||||
results = {}
|
results = {}
|
||||||
for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
|
for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
|
||||||
N, K = shape[0], shape[1]
|
N, K = shape[0], shape[1]
|
||||||
@@ -379,7 +379,7 @@ def tune_on_gpu(args_dict):
|
|||||||
best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)}
|
best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)}
|
||||||
save_configs(N, K, block_n, block_k, best_configs, save_path, input_type)
|
save_configs(N, K, block_n, block_k, best_configs, save_path, input_type)
|
||||||
|
|
||||||
end = time.time()
|
end = time.perf_counter()
|
||||||
print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
|
print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -70,7 +70,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
|
|||||||
# Select backend
|
# Select backend
|
||||||
backend = select_sglang_backend(args)
|
backend = select_sglang_backend(args)
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = line_retrieval.run_batch(
|
states = line_retrieval.run_batch(
|
||||||
arguments,
|
arguments,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
@@ -78,7 +78,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
|
|||||||
num_threads=args.parallel,
|
num_threads=args.parallel,
|
||||||
progress_bar=True,
|
progress_bar=True,
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
corrects = []
|
corrects = []
|
||||||
for i in range(len(arguments)):
|
for i in range(len(arguments)):
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ def main(args):
|
|||||||
sgl.set_default_backend(backend)
|
sgl.set_default_backend(backend)
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
if args.parallel == 1:
|
if args.parallel == 1:
|
||||||
for i in tqdm.tqdm(range(len(lines))):
|
for i in tqdm.tqdm(range(len(lines))):
|
||||||
image_file = arguments[i]["image_file"]
|
image_file = arguments[i]["image_file"]
|
||||||
@@ -52,7 +52,7 @@ def main(args):
|
|||||||
states = image_qa.run_batch(
|
states = image_qa.run_batch(
|
||||||
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
|
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
print(f"Latency: {latency:.3f}")
|
print(f"Latency: {latency:.3f}")
|
||||||
|
|
||||||
|
|||||||
@@ -85,7 +85,7 @@ def main(args):
|
|||||||
call_generate = partial(get_call_generate(args), temperature=0)
|
call_generate = partial(get_call_generate(args), temperature=0)
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
|
|
||||||
if args.backend != "lmql":
|
if args.backend != "lmql":
|
||||||
|
|
||||||
@@ -120,7 +120,7 @@ def main(args):
|
|||||||
asyncio.gather(*[get_one_answer_async(i) for i in bt])
|
asyncio.gather(*[get_one_answer_async(i) for i in bt])
|
||||||
)
|
)
|
||||||
|
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
print(f"Latency: {latency:.3f}")
|
print(f"Latency: {latency:.3f}")
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ def main(args):
|
|||||||
backend = select_sglang_backend(args)
|
backend = select_sglang_backend(args)
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = multi_dimension_judge.run_batch(
|
states = multi_dimension_judge.run_batch(
|
||||||
arguments,
|
arguments,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
@@ -67,7 +67,7 @@ def main(args):
|
|||||||
num_threads=args.parallel,
|
num_threads=args.parallel,
|
||||||
progress_bar=True,
|
progress_bar=True,
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
print(f"Latency: {latency:.3f}")
|
print(f"Latency: {latency:.3f}")
|
||||||
|
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ def main(args):
|
|||||||
def get_one_answer(i):
|
def get_one_answer(i):
|
||||||
states[i] = json_decode(generate=call_generate, **arguments[i])
|
states[i] = json_decode(generate=call_generate, **arguments[i])
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
if args.parallel == 1:
|
if args.parallel == 1:
|
||||||
for i in tqdm(range(len(arguments))):
|
for i in tqdm(range(len(arguments))):
|
||||||
get_one_answer(i)
|
get_one_answer(i)
|
||||||
@@ -58,7 +58,7 @@ def main(args):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
print(f"Latency: {latency:.3f}")
|
print(f"Latency: {latency:.3f}")
|
||||||
|
|||||||
@@ -46,11 +46,11 @@ def main(args):
|
|||||||
sgl.set_default_backend(backend)
|
sgl.set_default_backend(backend)
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = json_decode.run_batch(
|
states = json_decode.run_batch(
|
||||||
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
|
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
print(f"Latency: {latency:.3f}")
|
print(f"Latency: {latency:.3f}")
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ def evaluate(args, subject, dev_df, test_df, call_generate):
|
|||||||
pred = call_generate(prompts[i], temperature=0, max_tokens=max_tokens)
|
pred = call_generate(prompts[i], temperature=0, max_tokens=max_tokens)
|
||||||
preds[i] = pred.strip()[0]
|
preds[i] = pred.strip()[0]
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
if args.parallel == 1:
|
if args.parallel == 1:
|
||||||
for i in range(len(prompts)):
|
for i in range(len(prompts)):
|
||||||
get_one_answer(i)
|
get_one_answer(i)
|
||||||
@@ -94,9 +94,9 @@ def evaluate(args, subject, dev_df, test_df, call_generate):
|
|||||||
for j in range(len(rets)):
|
for j in range(len(rets)):
|
||||||
preds[i + j] = rets[j].strip()[0]
|
preds[i + j] = rets[j].strip()[0]
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
asyncio.run(batched_call(batch_size=args.parallel))
|
asyncio.run(batched_call(batch_size=args.parallel))
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
cors = [pred == label for pred, label in zip(preds, labels)]
|
cors = [pred == label for pred, label in zip(preds, labels)]
|
||||||
|
|||||||
@@ -116,7 +116,7 @@ def main(args):
|
|||||||
backend = select_sglang_backend(args)
|
backend = select_sglang_backend(args)
|
||||||
|
|
||||||
# Run
|
# Run
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = few_shot_mmlu.run_batch(
|
states = few_shot_mmlu.run_batch(
|
||||||
arguments,
|
arguments,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
@@ -128,7 +128,7 @@ def main(args):
|
|||||||
preds = [
|
preds = [
|
||||||
s["answer"].strip()[0] if len(s["answer"].strip()) > 0 else "" for s in states
|
s["answer"].strip()[0] if len(s["answer"].strip()) > 0 else "" for s in states
|
||||||
]
|
]
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
cors = [pred == label for pred, label in zip(preds, labels)]
|
cors = [pred == label for pred, label in zip(preds, labels)]
|
||||||
|
|||||||
@@ -119,7 +119,7 @@ async def eval_mmmu(args) -> None:
|
|||||||
api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1"
|
api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1"
|
||||||
)
|
)
|
||||||
semaphore = asyncio.Semaphore(args.concurrency)
|
semaphore = asyncio.Semaphore(args.concurrency)
|
||||||
start = time.time()
|
start = time.perf_counter()
|
||||||
base_url = f"http://127.0.0.1:{args.port}"
|
base_url = f"http://127.0.0.1:{args.port}"
|
||||||
|
|
||||||
if args.profile:
|
if args.profile:
|
||||||
@@ -147,7 +147,7 @@ async def eval_mmmu(args) -> None:
|
|||||||
if profile_output.success:
|
if profile_output.success:
|
||||||
print("Profiler stopped")
|
print("Profiler stopped")
|
||||||
|
|
||||||
print(f"Benchmark time: {time.time() - start}")
|
print(f"Benchmark time: {time.perf_counter() - start}")
|
||||||
args.output_path = f"./val_sglang.json"
|
args.output_path = f"./val_sglang.json"
|
||||||
save_json(args.output_path, out_samples)
|
save_json(args.output_path, out_samples)
|
||||||
eval_result(model_answer_path=args.output_path, answer_dict=answer_dict)
|
eval_result(model_answer_path=args.output_path, answer_dict=answer_dict)
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ def main(args):
|
|||||||
answers[i] = cur_answers
|
answers[i] = cur_answers
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
if args.parallel == 1:
|
if args.parallel == 1:
|
||||||
for i in tqdm(range(len(questions))):
|
for i in tqdm(range(len(questions))):
|
||||||
get_answer(i)
|
get_answer(i)
|
||||||
@@ -79,7 +79,7 @@ def main(args):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
|
print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
|
||||||
|
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ def main(args):
|
|||||||
sgl.set_default_backend(backend)
|
sgl.set_default_backend(backend)
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
rets = answer_mt_bench.run_batch(
|
rets = answer_mt_bench.run_batch(
|
||||||
arguments,
|
arguments,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
@@ -66,7 +66,7 @@ def main(args):
|
|||||||
progress_bar=True,
|
progress_bar=True,
|
||||||
)
|
)
|
||||||
answers = [[s["answer_1"], s["answer_2"]] for s in rets]
|
answers = [[s["answer_1"], s["answer_2"]] for s in rets]
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
|
print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
|
||||||
|
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ def main(args):
|
|||||||
sgl.set_default_backend(backend)
|
sgl.set_default_backend(backend)
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
rets = answer_mt_bench.run_batch(
|
rets = answer_mt_bench.run_batch(
|
||||||
arguments,
|
arguments,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
@@ -78,7 +78,7 @@ def main(args):
|
|||||||
)
|
)
|
||||||
answers = [[s["answer_1"], s["answer_2"]] for s in rets]
|
answers = [[s["answer_1"], s["answer_2"]] for s in rets]
|
||||||
|
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
num_output_tokens = sum(
|
num_output_tokens = sum(
|
||||||
s.get_meta_info("answer_1")["completion_tokens"]
|
s.get_meta_info("answer_1")["completion_tokens"]
|
||||||
+ s.get_meta_info("answer_2")["completion_tokens"]
|
+ s.get_meta_info("answer_2")["completion_tokens"]
|
||||||
|
|||||||
@@ -113,7 +113,7 @@ def main(args):
|
|||||||
answer = multi_chain_gsm8k(questions[i], args.num_chains, call_generate)
|
answer = multi_chain_gsm8k(questions[i], args.num_chains, call_generate)
|
||||||
states[i] = answer
|
states[i] = answer
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
if args.parallel == 1:
|
if args.parallel == 1:
|
||||||
for i in tqdm(range(len(questions))):
|
for i in tqdm(range(len(questions))):
|
||||||
get_one_answer(i)
|
get_one_answer(i)
|
||||||
@@ -134,7 +134,7 @@ def main(args):
|
|||||||
)
|
)
|
||||||
states[i] = answer
|
states[i] = answer
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
batches = [
|
batches = [
|
||||||
list(range(i, min(i + args.parallel, len(questions))))
|
list(range(i, min(i + args.parallel, len(questions))))
|
||||||
@@ -144,7 +144,7 @@ def main(args):
|
|||||||
tasks = [get_one_answer_asyncio(k) for k in bt]
|
tasks = [get_one_answer_asyncio(k) for k in bt]
|
||||||
loop.run_until_complete(asyncio.gather(*tasks))
|
loop.run_until_complete(asyncio.gather(*tasks))
|
||||||
|
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
preds = []
|
preds = []
|
||||||
for i in range(len(states)):
|
for i in range(len(states)):
|
||||||
|
|||||||
@@ -90,7 +90,7 @@ def main(args):
|
|||||||
backend = select_sglang_backend(args)
|
backend = select_sglang_backend(args)
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = multi_chain_gsm8k.run_batch(
|
states = multi_chain_gsm8k.run_batch(
|
||||||
arguments,
|
arguments,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
@@ -98,7 +98,7 @@ def main(args):
|
|||||||
num_threads=args.parallel,
|
num_threads=args.parallel,
|
||||||
progress_bar=True,
|
progress_bar=True,
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
preds = []
|
preds = []
|
||||||
for i in range(len(states)):
|
for i in range(len(states)):
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ def main(args):
|
|||||||
def get_one_answer(i):
|
def get_one_answer(i):
|
||||||
states[i] = multi_document_qa(generate=call_generate, **arguments[i])
|
states[i] = multi_document_qa(generate=call_generate, **arguments[i])
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
if args.parallel == 1:
|
if args.parallel == 1:
|
||||||
for i in tqdm(range(len(labels))):
|
for i in tqdm(range(len(labels))):
|
||||||
get_one_answer(i)
|
get_one_answer(i)
|
||||||
@@ -74,7 +74,7 @@ def main(args):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
print(states)
|
print(states)
|
||||||
|
|||||||
@@ -49,11 +49,11 @@ def main(args):
|
|||||||
sgl.set_default_backend(backend)
|
sgl.set_default_backend(backend)
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = multi_document_qa.run_batch(
|
states = multi_document_qa.run_batch(
|
||||||
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
|
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
print([s["answer"] for s in states])
|
print([s["answer"] for s in states])
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ def main(args):
|
|||||||
def get_one_answer(i):
|
def get_one_answer(i):
|
||||||
states[i] = multi_turns(generate=call_generate, **multi_qas[i])
|
states[i] = multi_turns(generate=call_generate, **multi_qas[i])
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
if args.parallel == 1:
|
if args.parallel == 1:
|
||||||
for i in tqdm(range(len(multi_qas))):
|
for i in tqdm(range(len(multi_qas))):
|
||||||
get_one_answer(i)
|
get_one_answer(i)
|
||||||
@@ -50,7 +50,7 @@ def main(args):
|
|||||||
for _ in rets:
|
for _ in rets:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
print(f"Latency: {latency:.3f}")
|
print(f"Latency: {latency:.3f}")
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ def main(args):
|
|||||||
|
|
||||||
backend = select_sglang_backend(args)
|
backend = select_sglang_backend(args)
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = multi_turns.run_batch(
|
states = multi_turns.run_batch(
|
||||||
multi_qas,
|
multi_qas,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
@@ -35,7 +35,7 @@ def main(args):
|
|||||||
num_threads=args.parallel,
|
num_threads=args.parallel,
|
||||||
progress_bar=True,
|
progress_bar=True,
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
print(f"Latency: {latency:.3f}")
|
print(f"Latency: {latency:.3f}")
|
||||||
|
|
||||||
|
|||||||
@@ -84,7 +84,7 @@ def main(args):
|
|||||||
|
|
||||||
backend = select_sglang_backend(args)
|
backend = select_sglang_backend(args)
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = multi_turns.run_batch(
|
states = multi_turns.run_batch(
|
||||||
multi_qas,
|
multi_qas,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
@@ -92,7 +92,7 @@ def main(args):
|
|||||||
num_threads="auto",
|
num_threads="auto",
|
||||||
progress_bar=True,
|
progress_bar=True,
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
print(f"Latency: {latency:.3f}")
|
print(f"Latency: {latency:.3f}")
|
||||||
|
|
||||||
|
|||||||
@@ -146,7 +146,7 @@ def main(args):
|
|||||||
|
|
||||||
states.append(answer)
|
states.append(answer)
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
|
|
||||||
if args.backend != "lmql":
|
if args.backend != "lmql":
|
||||||
if args.parallel == 1:
|
if args.parallel == 1:
|
||||||
@@ -173,7 +173,7 @@ def main(args):
|
|||||||
tasks = [run_single_agent_async(arg) for arg in bt]
|
tasks = [run_single_agent_async(arg) for arg in bt]
|
||||||
loop.run_until_complete(asyncio.gather(*tasks))
|
loop.run_until_complete(asyncio.gather(*tasks))
|
||||||
|
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
print(f"Latency: {latency:.3f}")
|
print(f"Latency: {latency:.3f}")
|
||||||
|
|
||||||
|
|||||||
@@ -115,14 +115,14 @@ def main(args):
|
|||||||
sgl.set_default_backend(backend)
|
sgl.set_default_backend(backend)
|
||||||
|
|
||||||
states = []
|
states = []
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = webthink.run_batch(
|
states = webthink.run_batch(
|
||||||
arguments,
|
arguments,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
num_threads=args.parallel,
|
num_threads=args.parallel,
|
||||||
progress_bar=True,
|
progress_bar=True,
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
print(f"Latency: {latency:.3f}")
|
print(f"Latency: {latency:.3f}")
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ def main(args):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = reasoning_gen.run_batch(
|
states = reasoning_gen.run_batch(
|
||||||
questions,
|
questions,
|
||||||
num_threads=args.parallel,
|
num_threads=args.parallel,
|
||||||
@@ -60,7 +60,7 @@ def main(args):
|
|||||||
max_new_tokens=32768,
|
max_new_tokens=32768,
|
||||||
top_p=0.95,
|
top_p=0.95,
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Extract results and record outcomes in a list.
|
# Extract results and record outcomes in a list.
|
||||||
outcomes = []
|
outcomes = []
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ def main(args):
|
|||||||
call_generate = partial(get_call_generate(args), temperature=0)
|
call_generate = partial(get_call_generate(args), temperature=0)
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
if args.backend != "lmql":
|
if args.backend != "lmql":
|
||||||
|
|
||||||
def get_one_answer(i):
|
def get_one_answer(i):
|
||||||
@@ -102,7 +102,7 @@ def main(args):
|
|||||||
loop.run_until_complete(
|
loop.run_until_complete(
|
||||||
asyncio.gather(*[get_one_answer_async(i) for i in batch])
|
asyncio.gather(*[get_one_answer_async(i) for i in batch])
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
print(f"Latency: {latency:.3f}")
|
print(f"Latency: {latency:.3f}")
|
||||||
|
|||||||
@@ -65,11 +65,11 @@ def main(args):
|
|||||||
sgl.set_default_backend(select_sglang_backend(args))
|
sgl.set_default_backend(select_sglang_backend(args))
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = suggest_tips.run_batch(
|
states = suggest_tips.run_batch(
|
||||||
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
|
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
print(f"Latency: {latency:.3f}")
|
print(f"Latency: {latency:.3f}")
|
||||||
|
|||||||
@@ -138,7 +138,7 @@ def main(args):
|
|||||||
# Run requests
|
# Run requests
|
||||||
states = [None] * len(questions)
|
states = [None] * len(questions)
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
if args.backend != "lmql":
|
if args.backend != "lmql":
|
||||||
|
|
||||||
def get_one_answer(i):
|
def get_one_answer(i):
|
||||||
@@ -177,7 +177,7 @@ def main(args):
|
|||||||
tasks = [get_one_answer_async(k) for k in bt]
|
tasks = [get_one_answer_async(k) for k in bt]
|
||||||
loop.run_until_complete(asyncio.gather(*tasks))
|
loop.run_until_complete(asyncio.gather(*tasks))
|
||||||
|
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
answers_text = []
|
answers_text = []
|
||||||
for s in states:
|
for s in states:
|
||||||
|
|||||||
@@ -119,7 +119,7 @@ def main(args):
|
|||||||
backend = select_sglang_backend(args)
|
backend = select_sglang_backend(args)
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = tree_search.run_batch(
|
states = tree_search.run_batch(
|
||||||
arguments,
|
arguments,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
@@ -127,7 +127,7 @@ def main(args):
|
|||||||
num_threads=args.parallel,
|
num_threads=args.parallel,
|
||||||
progress_bar=True,
|
progress_bar=True,
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
answers_text = []
|
answers_text = []
|
||||||
for s in states:
|
for s in states:
|
||||||
answers_text.append([x for xs in s.ret_value for x in xs])
|
answers_text.append([x for xs in s.ret_value for x in xs])
|
||||||
|
|||||||
@@ -121,7 +121,7 @@ def main(args):
|
|||||||
def get_one_answer(i):
|
def get_one_answer(i):
|
||||||
states[i] = tree_search(**arguments[i], call_generate=call_generate)
|
states[i] = tree_search(**arguments[i], call_generate=call_generate)
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
if args.parallel == 1:
|
if args.parallel == 1:
|
||||||
for i in tqdm(range(len(questions))):
|
for i in tqdm(range(len(questions))):
|
||||||
get_one_answer(i)
|
get_one_answer(i)
|
||||||
@@ -134,7 +134,7 @@ def main(args):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
answers_text = []
|
answers_text = []
|
||||||
for s in states:
|
for s in states:
|
||||||
|
|||||||
@@ -107,7 +107,7 @@ def main(args):
|
|||||||
backend = select_sglang_backend(args)
|
backend = select_sglang_backend(args)
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = tree_search.run_batch(
|
states = tree_search.run_batch(
|
||||||
arguments,
|
arguments,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
@@ -115,7 +115,7 @@ def main(args):
|
|||||||
num_threads=args.parallel,
|
num_threads=args.parallel,
|
||||||
progress_bar=True,
|
progress_bar=True,
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
answers_text = []
|
answers_text = []
|
||||||
for s in states:
|
for s in states:
|
||||||
answers_text.append([x for xs in s["answer"] for x in xs])
|
answers_text.append([x for xs in s["answer"] for x in xs])
|
||||||
|
|||||||
@@ -90,7 +90,7 @@ def run_eval(args):
|
|||||||
#####################################
|
#####################################
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
states = few_shot_gsm8k.run_batch(
|
states = few_shot_gsm8k.run_batch(
|
||||||
arguments,
|
arguments,
|
||||||
temperature=args.temperature if hasattr(args, "temperature") else 0,
|
temperature=args.temperature if hasattr(args, "temperature") else 0,
|
||||||
@@ -99,7 +99,7 @@ def run_eval(args):
|
|||||||
return_logprob=getattr(args, "return_logprob", None),
|
return_logprob=getattr(args, "return_logprob", None),
|
||||||
logprob_start_len=getattr(args, "logprob_start_len", None),
|
logprob_start_len=getattr(args, "logprob_start_len", None),
|
||||||
)
|
)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
preds = []
|
preds = []
|
||||||
for i in range(len(states)):
|
for i in range(len(states)):
|
||||||
|
|||||||
@@ -89,7 +89,7 @@ def run_eval(args):
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
|
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
|
|
||||||
@@ -98,7 +98,7 @@ def run_eval(args):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# End requests
|
# End requests
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Shutdown the engine
|
# Shutdown the engine
|
||||||
engine.shutdown()
|
engine.shutdown()
|
||||||
|
|||||||
@@ -71,9 +71,9 @@ def run_eval(args):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Run eval
|
# Run eval
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
result = eval_obj(sampler)
|
result = eval_obj(sampler)
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Dump reports
|
# Dump reports
|
||||||
metrics = result.metrics | {"score": result.score}
|
metrics = result.metrics | {"score": result.score}
|
||||||
|
|||||||
@@ -503,7 +503,7 @@ def test_hellaswag_select():
|
|||||||
#####################################
|
#####################################
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
rets = few_shot_hellaswag.run_batch(
|
rets = few_shot_hellaswag.run_batch(
|
||||||
arguments,
|
arguments,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
@@ -514,13 +514,13 @@ def test_hellaswag_select():
|
|||||||
preds = []
|
preds = []
|
||||||
for i, ret in enumerate(rets):
|
for i, ret in enumerate(rets):
|
||||||
preds.append(choices[i].index(ret["answer"]))
|
preds.append(choices[i].index(ret["answer"]))
|
||||||
latency = time.time() - tic
|
latency = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
accuracy = np.mean(np.array(preds) == np.array(labels))
|
accuracy = np.mean(np.array(preds) == np.array(labels))
|
||||||
|
|
||||||
# Test generator style of run_batch
|
# Test generator style of run_batch
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
rets = few_shot_hellaswag.run_batch(
|
rets = few_shot_hellaswag.run_batch(
|
||||||
arguments,
|
arguments,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
@@ -531,7 +531,7 @@ def test_hellaswag_select():
|
|||||||
preds_gen = []
|
preds_gen = []
|
||||||
for i, ret in enumerate(rets):
|
for i, ret in enumerate(rets):
|
||||||
preds_gen.append(choices[i].index(ret["answer"]))
|
preds_gen.append(choices[i].index(ret["answer"]))
|
||||||
latency_gen = time.time() - tic
|
latency_gen = time.perf_counter() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
|
accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
|
||||||
|
|||||||
@@ -449,9 +449,9 @@ def popen_launch_server(
|
|||||||
else:
|
else:
|
||||||
process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
|
process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.perf_counter()
|
||||||
with requests.Session() as session:
|
with requests.Session() as session:
|
||||||
while time.time() - start_time < timeout:
|
while time.perf_counter() - start_time < timeout:
|
||||||
try:
|
try:
|
||||||
headers = {
|
headers = {
|
||||||
"Content-Type": "application/json; charset=utf-8",
|
"Content-Type": "application/json; charset=utf-8",
|
||||||
@@ -584,7 +584,7 @@ class TestFile:
|
|||||||
|
|
||||||
|
|
||||||
def run_unittest_files(files: List[TestFile], timeout_per_file: float):
|
def run_unittest_files(files: List[TestFile], timeout_per_file: float):
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
success = True
|
success = True
|
||||||
|
|
||||||
for i, file in enumerate(files):
|
for i, file in enumerate(files):
|
||||||
@@ -599,13 +599,13 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
|
|||||||
f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
|
f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
|
||||||
flush=True,
|
flush=True,
|
||||||
)
|
)
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
|
|
||||||
process = subprocess.Popen(
|
process = subprocess.Popen(
|
||||||
["python3", filename], stdout=None, stderr=None, env=os.environ
|
["python3", filename], stdout=None, stderr=None, env=os.environ
|
||||||
)
|
)
|
||||||
process.wait()
|
process.wait()
|
||||||
elapsed = time.time() - tic
|
elapsed = time.perf_counter() - tic
|
||||||
|
|
||||||
print(
|
print(
|
||||||
f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
|
f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
|
||||||
@@ -631,9 +631,9 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
|
|||||||
break
|
break
|
||||||
|
|
||||||
if success:
|
if success:
|
||||||
print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
|
print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
|
||||||
else:
|
else:
|
||||||
print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
|
print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
|
||||||
|
|
||||||
return 0 if success else -1
|
return 0 if success else -1
|
||||||
|
|
||||||
|
|||||||
@@ -92,9 +92,9 @@ def popen_launch_router(
|
|||||||
|
|
||||||
process = subprocess.Popen(command, stdout=None, stderr=None)
|
process = subprocess.Popen(command, stdout=None, stderr=None)
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.perf_counter()
|
||||||
with requests.Session() as session:
|
with requests.Session() as session:
|
||||||
while time.time() - start_time < timeout:
|
while time.perf_counter() - start_time < timeout:
|
||||||
try:
|
try:
|
||||||
response = session.get(f"{base_url}/health")
|
response = session.get(f"{base_url}/health")
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
@@ -155,11 +155,11 @@ def terminate_and_wait(process, timeout=300):
|
|||||||
return
|
return
|
||||||
|
|
||||||
process.terminate()
|
process.terminate()
|
||||||
start_time = time.time()
|
start_time = time.perf_counter()
|
||||||
|
|
||||||
while process.poll() is None:
|
while process.poll() is None:
|
||||||
print(f"Terminating process {process.pid}")
|
print(f"Terminating process {process.pid}")
|
||||||
if time.time() - start_time > timeout:
|
if time.perf_counter() - start_time > timeout:
|
||||||
raise TimeoutError(
|
raise TimeoutError(
|
||||||
f"Process {process.pid} failed to terminate within {timeout}s"
|
f"Process {process.pid} failed to terminate within {timeout}s"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -184,9 +184,9 @@ class ExperimentRunner:
|
|||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def wait_for_server(self, port: int, timeout: int = 300) -> bool:
|
def wait_for_server(self, port: int, timeout: int = 300) -> bool:
|
||||||
start_time = time.time()
|
start_time = time.perf_counter()
|
||||||
|
|
||||||
while time.time() - start_time < timeout:
|
while time.perf_counter() - start_time < timeout:
|
||||||
try:
|
try:
|
||||||
response = requests.get(f"http://localhost:{port}/health")
|
response = requests.get(f"http://localhost:{port}/health")
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
@@ -197,7 +197,7 @@ class ExperimentRunner:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def run_task(self, config: TaskConfig) -> TaskResult:
|
def run_task(self, config: TaskConfig) -> TaskResult:
|
||||||
start_time = time.time()
|
start_time = time.perf_counter()
|
||||||
client_output = []
|
client_output = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -247,7 +247,7 @@ class ExperimentRunner:
|
|||||||
name=config.name,
|
name=config.name,
|
||||||
success=True,
|
success=True,
|
||||||
output=formatted_output,
|
output=formatted_output,
|
||||||
runtime=time.time() - start_time,
|
runtime=time.perf_counter() - start_time,
|
||||||
timestamp=datetime.now().isoformat(),
|
timestamp=datetime.now().isoformat(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -256,7 +256,7 @@ class ExperimentRunner:
|
|||||||
name=config.name,
|
name=config.name,
|
||||||
success=False,
|
success=False,
|
||||||
output=str(e),
|
output=str(e),
|
||||||
runtime=time.time() - start_time,
|
runtime=time.perf_counter() - start_time,
|
||||||
timestamp=datetime.now().isoformat(),
|
timestamp=datetime.now().isoformat(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -79,9 +79,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
|
|||||||
# warm up
|
# warm up
|
||||||
hf_outputs = hf_runner.forward(truncated_prompts)
|
hf_outputs = hf_runner.forward(truncated_prompts)
|
||||||
|
|
||||||
st_start_time = time.time()
|
st_start_time = time.perf_counter()
|
||||||
hf_outputs = hf_runner.forward(truncated_prompts)
|
hf_outputs = hf_runner.forward(truncated_prompts)
|
||||||
st_end_time = time.time()
|
st_end_time = time.perf_counter()
|
||||||
|
|
||||||
with SRTRunner(
|
with SRTRunner(
|
||||||
model_path,
|
model_path,
|
||||||
@@ -95,9 +95,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
|
|||||||
# warm up
|
# warm up
|
||||||
srt_outputs = srt_runner.forward(truncated_prompts)
|
srt_outputs = srt_runner.forward(truncated_prompts)
|
||||||
|
|
||||||
sgl_start_time = time.time()
|
sgl_start_time = time.perf_counter()
|
||||||
srt_outputs = srt_runner.forward(truncated_prompts)
|
srt_outputs = srt_runner.forward(truncated_prompts)
|
||||||
sgl_end_time = time.time()
|
sgl_end_time = time.perf_counter()
|
||||||
|
|
||||||
transformer_time = st_end_time - st_start_time
|
transformer_time = st_end_time - st_start_time
|
||||||
sgl_time = sgl_end_time - sgl_start_time
|
sgl_time = sgl_end_time - sgl_start_time
|
||||||
|
|||||||
@@ -130,9 +130,9 @@ class TestGPTQModelDynamic(CustomTestCase):
|
|||||||
def test_throughput(self):
|
def test_throughput(self):
|
||||||
max_tokens = 256
|
max_tokens = 256
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
result = self.run_decode(max_tokens)
|
result = self.run_decode(max_tokens)
|
||||||
tok = time.time()
|
tok = time.perf_counter()
|
||||||
|
|
||||||
print(f"result = `{result}`")
|
print(f"result = `{result}`")
|
||||||
|
|
||||||
@@ -185,9 +185,9 @@ class TestGPTQModelDynamicWithMarlin(CustomTestCase):
|
|||||||
def test_throughput(self):
|
def test_throughput(self):
|
||||||
max_tokens = 256
|
max_tokens = 256
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
result = self.run_decode(max_tokens)
|
result = self.run_decode(max_tokens)
|
||||||
tok = time.time()
|
tok = time.perf_counter()
|
||||||
|
|
||||||
print(f"result = `{result}`")
|
print(f"result = `{result}`")
|
||||||
|
|
||||||
|
|||||||
@@ -42,10 +42,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
print("release_memory_occupation start")
|
print("release_memory_occupation start")
|
||||||
t = time.time()
|
t = time.perf_counter()
|
||||||
engine.release_memory_occupation()
|
engine.release_memory_occupation()
|
||||||
if _DEBUG_EXTRA:
|
if _DEBUG_EXTRA:
|
||||||
print("release_memory_occupation", time.time() - t)
|
print("release_memory_occupation", time.perf_counter() - t)
|
||||||
|
|
||||||
if _DEBUG_EXTRA:
|
if _DEBUG_EXTRA:
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
@@ -60,10 +60,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
|
|||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
|
|
||||||
print("resume_memory_occupation start")
|
print("resume_memory_occupation start")
|
||||||
t = time.time()
|
t = time.perf_counter()
|
||||||
engine.resume_memory_occupation()
|
engine.resume_memory_occupation()
|
||||||
if _DEBUG_EXTRA:
|
if _DEBUG_EXTRA:
|
||||||
print("resume_memory_occupation", time.time() - t)
|
print("resume_memory_occupation", time.perf_counter() - t)
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
_try_allocate_big_tensor(),
|
_try_allocate_big_tensor(),
|
||||||
|
|||||||
@@ -62,9 +62,9 @@ class TestTorchCompile(CustomTestCase):
|
|||||||
res = self.run_decode(16)
|
res = self.run_decode(16)
|
||||||
|
|
||||||
max_tokens = 256
|
max_tokens = 256
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
res = self.run_decode(max_tokens)
|
res = self.run_decode(max_tokens)
|
||||||
tok = time.time()
|
tok = time.perf_counter()
|
||||||
print(f"{res=}")
|
print(f"{res=}")
|
||||||
throughput = max_tokens / (tok - tic)
|
throughput = max_tokens / (tok - tic)
|
||||||
print(f"Throughput: {throughput} tokens/s")
|
print(f"Throughput: {throughput} tokens/s")
|
||||||
|
|||||||
@@ -62,9 +62,9 @@ class TestTorchCompileMoe(CustomTestCase):
|
|||||||
res = self.run_decode(16)
|
res = self.run_decode(16)
|
||||||
|
|
||||||
max_tokens = 256
|
max_tokens = 256
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
res = self.run_decode(max_tokens)
|
res = self.run_decode(max_tokens)
|
||||||
tok = time.time()
|
tok = time.perf_counter()
|
||||||
print(f"{res=}")
|
print(f"{res=}")
|
||||||
throughput = max_tokens / (tok - tic)
|
throughput = max_tokens / (tok - tic)
|
||||||
self.assertGreaterEqual(throughput, 285)
|
self.assertGreaterEqual(throughput, 285)
|
||||||
|
|||||||
@@ -61,9 +61,9 @@ class TestTorchAO(CustomTestCase):
|
|||||||
|
|
||||||
max_tokens = 256
|
max_tokens = 256
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
res = self.run_decode(max_tokens)
|
res = self.run_decode(max_tokens)
|
||||||
tok = time.time()
|
tok = time.perf_counter()
|
||||||
print(res["text"])
|
print(res["text"])
|
||||||
throughput = max_tokens / (tok - tic)
|
throughput = max_tokens / (tok - tic)
|
||||||
print(f"Throughput: {throughput} tokens/s")
|
print(f"Throughput: {throughput} tokens/s")
|
||||||
|
|||||||
@@ -164,7 +164,7 @@ def init_process_hf(
|
|||||||
)
|
)
|
||||||
dist.barrier(group=group, device_ids=[rank])
|
dist.barrier(group=group, device_ids=[rank])
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
time_begin_broadcast = time.time()
|
time_begin_broadcast = time.perf_counter()
|
||||||
|
|
||||||
# The last parameter is lm_head.weight, which is tied
|
# The last parameter is lm_head.weight, which is tied
|
||||||
# with embed_tokens.weight. Actually, we only need
|
# with embed_tokens.weight. Actually, we only need
|
||||||
@@ -182,7 +182,7 @@ def init_process_hf(
|
|||||||
group=group,
|
group=group,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
time_end_broadcast = time.time()
|
time_end_broadcast = time.perf_counter()
|
||||||
|
|
||||||
# Measure the latency of broadcasting/weights update.
|
# Measure the latency of broadcasting/weights update.
|
||||||
broadcast_time = time_end_broadcast - time_begin_broadcast
|
broadcast_time = time_end_broadcast - time_begin_broadcast
|
||||||
@@ -282,7 +282,7 @@ def init_process_sgl(
|
|||||||
)
|
)
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
time_begin_update = time.time()
|
time_begin_update = time.perf_counter()
|
||||||
|
|
||||||
# The last parameter is lm_head.weight, which is tied
|
# The last parameter is lm_head.weight, which is tied
|
||||||
# with embed_tokens.weight. Actually, we only need
|
# with embed_tokens.weight. Actually, we only need
|
||||||
@@ -312,7 +312,7 @@ def init_process_sgl(
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
time_end_update = time.time()
|
time_end_update = time.perf_counter()
|
||||||
|
|
||||||
# Measure the latency of broadcast/weights update.
|
# Measure the latency of broadcast/weights update.
|
||||||
update_time = time_end_update - time_begin_update
|
update_time = time_end_update - time_begin_update
|
||||||
|
|||||||
@@ -21,9 +21,9 @@ def test_update_weights_from_tensor(tp_size):
|
|||||||
memory_before = torch.cuda.memory_allocated()
|
memory_before = torch.cuda.memory_allocated()
|
||||||
new_tensor = torch.full((16384, 2048), 1.5, device="cuda")
|
new_tensor = torch.full((16384, 2048), 1.5, device="cuda")
|
||||||
|
|
||||||
time_start = time.time()
|
time_start = time.perf_counter()
|
||||||
engine.update_weights_from_tensor([(x, new_tensor) for x in param_names])
|
engine.update_weights_from_tensor([(x, new_tensor) for x in param_names])
|
||||||
print(f"Time delta: {time.time() - time_start:.03f}")
|
print(f"Time delta: {time.perf_counter() - time_start:.03f}")
|
||||||
|
|
||||||
for param_name in param_names[:3]:
|
for param_name in param_names[:3]:
|
||||||
_check_param(engine, param_name, [1.5] * 5)
|
_check_param(engine, param_name, [1.5] * 5)
|
||||||
|
|||||||
@@ -62,9 +62,9 @@ class TestW8A8(CustomTestCase):
|
|||||||
def test_throughput(self):
|
def test_throughput(self):
|
||||||
max_tokens = 256
|
max_tokens = 256
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.perf_counter()
|
||||||
res = self.run_decode(max_tokens)
|
res = self.run_decode(max_tokens)
|
||||||
tok = time.time()
|
tok = time.perf_counter()
|
||||||
print(res["text"])
|
print(res["text"])
|
||||||
throughput = max_tokens / (tok - tic)
|
throughput = max_tokens / (tok - tic)
|
||||||
print(f"Throughput: {throughput} tokens/s")
|
print(f"Throughput: {throughput} tokens/s")
|
||||||
|
|||||||
Reference in New Issue
Block a user