Move output processing logic from scheduler.py into a separate file (#4354)

Lianmin Zheng
2025-03-12 16:21:49 -07:00
committed by GitHub
parent 2c3656f276
commit e35a93fa8a
6 changed files with 634 additions and 609 deletions


@@ -20,14 +20,13 @@ import random
 import tempfile
 from typing import List, Optional

-import torch
-
 from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     get_amdgpu_memory_capacity,
     get_hpu_memory_capacity,
     get_nvgpu_memory_capacity,
+    is_cuda,
     is_flashinfer_available,
     is_hip,
     is_port_available,
@@ -71,6 +70,7 @@ class ServerArgs:
     schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
+    page_size: int = 1

     # Other runtime options
     tp_size: int = 1
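
The new page_size field defaults to one token per page and is also exposed as a --page-size flag later in this diff. A minimal sketch of reading it through the CLI parser, assuming the existing ServerArgs.add_cli_args helper and a placeholder model path:

# Hedged sketch, not part of this commit; model path and page size are placeholders.
import argparse
from sglang.srt.server_args import ServerArgs

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)  # registers --page-size along with the other flags
args = parser.parse_args(["--model-path", "dummy-model", "--page-size", "16"])
print(args.page_size)  # -> 16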
@@ -190,10 +190,10 @@ class ServerArgs:
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)

-        if is_hip():
-            gpu_mem = get_amdgpu_memory_capacity()
-        elif torch.cuda.is_available():
+        if is_cuda():
             gpu_mem = get_nvgpu_memory_capacity()
+        elif is_hip():
+            gpu_mem = get_amdgpu_memory_capacity()
         elif self.device == "hpu":
             gpu_mem = get_hpu_memory_capacity()
         else:
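
The accelerator probing now checks NVIDIA first via is_cuda() instead of torch.cuda.is_available(), which also reports True on ROCm builds. A rough sketch of the assumed semantics of these helpers; the real implementations live in sglang.srt.utils and may differ:

import torch

# Assumed behaviour only, for illustration.
def is_cuda() -> bool:
    # NVIDIA CUDA build of PyTorch with a usable GPU (excludes ROCm, which
    # also answers True to torch.cuda.is_available()).
    return torch.cuda.is_available() and torch.version.hip is None

def is_hip() -> bool:
    # PyTorch built against ROCm/HIP (AMD GPUs).
    return torch.version.hip is not None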
@@ -258,7 +258,7 @@ class ServerArgs:
f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
)
# Others
# Data parallelism attention
if self.enable_dp_attention:
self.dp_size = self.tp_size
assert self.tp_size % self.dp_size == 0
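
With --enable-dp-attention, the data-parallel size is tied to the tensor-parallel size. A small sketch of the resulting configuration, assuming ServerArgs can be constructed directly in your environment and using a placeholder model path:

from sglang.srt.server_args import ServerArgs

# Placeholder model path; construction runs __post_init__, which applies the
# adjustment shown in the hunk above.
args = ServerArgs(model_path="dummy-model", tp_size=4, enable_dp_attention=True)
assert args.dp_size == args.tp_size == 4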
@@ -507,6 +507,12 @@ class ServerArgs:
             default=ServerArgs.cpu_offload_gb,
             help="How many GBs of RAM to reserve for CPU offloading.",
         )
+        parser.add_argument(
+            "--page-size",
+            type=int,
+            default=ServerArgs.page_size,
+            help="The number of tokens in a page.",
+        )

         # Other runtime options
         parser.add_argument(