[CI] test chunked prefill more (#5798)

This commit is contained in:
Lianmin Zheng
2025-04-28 10:57:17 -07:00
committed by GitHub
parent d73ddeb196
commit 849c83a0c0
15 changed files with 212 additions and 97 deletions

View File

@@ -975,7 +975,7 @@ class ModelRunner:
after_mem = get_available_gpu_memory(self.device, self.gpu_id)
logger.info(
f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f} s. "
f"avail mem={after_mem:.2f} GB. mem usage={(before_mem - after_mem):.2f} GB."
f"mem usage={(before_mem - after_mem):.2f} GB. avail mem={after_mem:.2f} GB."
)
def apply_torch_tp(self):

View File

@@ -426,7 +426,7 @@ class ServerArgs:
parser.add_argument(
"--skip-tokenizer-init",
action="store_true",
help="If set, skip init tokenizer and pass input_ids in generate request",
help="If set, skip init tokenizer and pass input_ids in generate request.",
)
parser.add_argument(
"--enable-tokenizer-batch-encode",
@@ -565,6 +565,7 @@ class ServerArgs:
"name, a tag name, or a commit id. If unspecified, will use "
"the default version.",
)
# Memory and scheduling
parser.add_argument(
"--mem-fraction-static",