Misc fix for min_p_sampling, --cuda-graph-bs (#2761)

This commit is contained in:
Lianmin Zheng
2025-01-07 02:52:53 -08:00
committed by GitHub
parent 6d08ce2aa9
commit bdc1acf6cd
17 changed files with 135 additions and 63 deletions

View File

@@ -532,6 +532,8 @@ def run_bench_serving(
request_rate,
other_server_args,
dataset_name="random",
dataset_path="",
tokenizer=None,
random_input_len=4096,
random_output_len=2048,
disable_stream=False,
@@ -553,9 +555,9 @@ def run_bench_serving(
host=None,
port=None,
dataset_name=dataset_name,
dataset_path="",
dataset_path=dataset_path,
model=None,
tokenizer=None,
tokenizer=tokenizer,
num_prompts=num_prompts,
sharegpt_output_len=None,
random_input_len=random_input_len,
@@ -657,16 +659,16 @@ STDERR_FILENAME = "stderr.txt"
STDOUT_FILENAME = "stdout.txt"
def read_output(output_lines):
def read_output(output_lines: List[str], filename: str = STDERR_FILENAME):
"""Print the output in real time with another thread."""
while not os.path.exists(STDERR_FILENAME):
while not os.path.exists(filename):
time.sleep(1)
pt = 0
while pt >= 0:
if pt > 0 and not os.path.exists(STDERR_FILENAME):
if pt > 0 and not os.path.exists(filename):
break
lines = open(STDERR_FILENAME).readlines()
lines = open(filename).readlines()
for line in lines[pt:]:
print(line, end="", flush=True)
output_lines.append(line)
@@ -747,6 +749,33 @@ def run_and_check_memory_leak(
assert has_abort
def run_command_and_capture_output(command, env: Optional[dict] = None):
    """Run ``command`` to completion and return the lines it wrote to stdout.

    The child's stdout and stderr are redirected to ``STDOUT_FILENAME`` and
    ``STDERR_FILENAME``. While the child runs, a background thread executes
    ``read_output`` on the stdout file, echoing each line and appending it to
    the list that is returned.

    Args:
        command: The command passed straight through to ``subprocess.Popen``.
        env: Optional environment mapping for the child process; ``None``
            lets ``subprocess.Popen`` use its default.

    Returns:
        The list of stdout lines collected by the reader thread.

    Raises:
        Whatever ``subprocess.Popen`` raises when the command cannot be
        started. The log files are still closed and removed in that case.
    """
    output_lines = []
    process = None
    reader = None
    try:
        # The context manager guarantees both handles are closed even when
        # Popen fails or the wait is interrupted.
        with open(STDOUT_FILENAME, "w") as stdout, open(
            STDERR_FILENAME, "w"
        ) as stderr:
            process = subprocess.Popen(
                command, stdout=stdout, stderr=stderr, env=env, text=True
            )

            # Launch a thread to stream the output
            thread = threading.Thread(
                target=read_output, args=(output_lines, STDOUT_FILENAME)
            )
            thread.start()
            # Only remember the thread once it is running, so the cleanup
            # below never joins a thread that was not started.
            reader = thread

            # Join the process
            process.wait()
    finally:
        # Removing the files must happen BEFORE joining the reader:
        # read_output leaves its polling loop when the file it tails no
        # longer exists, so joining first would block forever.
        # NOTE(review): output written after the reader's last poll may be
        # dropped, and a child that printed nothing may make the reader hit a
        # missing file on its next open() -- confirm against read_output.
        for path in (STDOUT_FILENAME, STDERR_FILENAME):
            if os.path.exists(path):
                os.remove(path)
        if process is not None:
            kill_process_tree(process.pid)
        if reader is not None:
            reader.join()

    return output_lines
def run_mmlu_test(
disable_radix_cache=False,
enable_mixed_chunk=False,