Add typo checker in pre-commit (#6179)
Co-authored-by: Brayden Zhong <b8zhong@uwaterloo.ca>
This commit is contained in:
@@ -147,3 +147,7 @@ exclude = [
|
||||
"scripts*",
|
||||
"tests*",
|
||||
]
|
||||
|
||||
[tool.codespell]
|
||||
ignore-words-list = "ans, als, hel, boostrap, childs, te, vas, hsa, ment"
|
||||
skip = "*.json,*.jsonl,*.patch,*.txt"
|
||||
|
||||
@@ -315,7 +315,7 @@ def throughput_test(
|
||||
tokenizer_id = server_args.tokenizer_path or server_args.model_path
|
||||
tokenizer = get_tokenizer(tokenizer_id)
|
||||
|
||||
# Set global environmnets
|
||||
# Set global environments
|
||||
set_ulimit()
|
||||
random.seed(bench_args.seed)
|
||||
np.random.seed(bench_args.seed)
|
||||
|
||||
@@ -1263,7 +1263,7 @@ async def benchmark(
|
||||
print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
|
||||
print(
|
||||
"{:<40} {:<10}".format(
|
||||
"Max reqeuest concurrency:",
|
||||
"Max request concurrency:",
|
||||
max_concurrency if max_concurrency else "not set",
|
||||
)
|
||||
)
|
||||
|
||||
@@ -129,7 +129,7 @@ def launch_server_process_and_send_one_request(
|
||||
|
||||
|
||||
def refine_server_args(server_args: ServerArgs, compile_args: CompileArgs):
|
||||
# Disbale cuda graph and torch compile to save time
|
||||
# Disable cuda graph and torch compile to save time
|
||||
server_args.disable_cuda_graph = True
|
||||
server_args.enable_torch_compile = False
|
||||
print(f"Disable CUDA Graph and Torch Compile to save time...")
|
||||
|
||||
@@ -38,7 +38,7 @@ def extract_prefix_by_tracing(program, backend):
|
||||
with TracingScope(tracer):
|
||||
tracer.ret_value = program.func(tracer, **arguments)
|
||||
except (StopTracing, TypeError, AttributeError):
|
||||
# Some exceptions may not be catched
|
||||
# Some exceptions may not be caught
|
||||
pass
|
||||
|
||||
# Run and cache prefix
|
||||
|
||||
@@ -27,7 +27,7 @@ completion_template_name = None
|
||||
|
||||
|
||||
class FimPosition:
|
||||
"""Postion of fim middle token."""
|
||||
"""Position of fim middle token."""
|
||||
|
||||
MIDDLE = auto()
|
||||
END = auto()
|
||||
|
||||
@@ -416,9 +416,9 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
||||
h = w = math.ceil(
|
||||
(self.image_size // self.patch_size) / self.downsample_ratio
|
||||
)
|
||||
# global views tokens h * (w + 1), 1 is for line seperator
|
||||
# global views tokens h * (w + 1), 1 is for line separator
|
||||
tokenized_image = [self.image_token_id] * h * (w + 1)
|
||||
# add a seperator between global and local views
|
||||
# add a separator between global and local views
|
||||
tokenized_image += [self.image_token_id]
|
||||
# local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
|
||||
tokenized_image += (
|
||||
|
||||
@@ -509,7 +509,7 @@ class SchedulerDisaggregationDecodeMixin:
|
||||
def event_loop_overlap_disagg_decode(self: Scheduler):
|
||||
result_queue = deque()
|
||||
self.last_batch: Optional[ScheduleBatch] = None
|
||||
self.last_batch_in_queue = False # last batch is modifed in-place, so we need another variable to track if it's extend
|
||||
self.last_batch_in_queue = False # last batch is modified in-place, so we need another variable to track if it's extend
|
||||
|
||||
while True:
|
||||
recv_reqs = self.recv_requests()
|
||||
|
||||
@@ -54,7 +54,7 @@ class FakeKVSender(BaseKVSender):
|
||||
logger.info(f"FakeKVSender send success")
|
||||
else:
|
||||
self.has_sent = False
|
||||
logger.info(f"FakeKVSender send fake transfering")
|
||||
logger.info(f"FakeKVSender send fake transferring")
|
||||
|
||||
def failure_exception(self):
|
||||
raise Exception("Fake KVSender Exception")
|
||||
|
||||
@@ -363,7 +363,7 @@ class MooncakeKVManager(BaseKVManager):
|
||||
self.request_status[bootstrap_room] = KVPoll.WaitingForInput
|
||||
|
||||
def check_status(self, bootstrap_room: int):
|
||||
# TOOD: do we really need the poll()?
|
||||
# TODO: do we really need the poll()?
|
||||
|
||||
return self.request_status[bootstrap_room]
|
||||
|
||||
|
||||
@@ -112,7 +112,7 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
|
||||
|
||||
|
||||
def kv_to_page_indices(kv_indices: np.ndarray, page_size: int):
|
||||
# 1. The page is guaruanteed to be full except the last page.
|
||||
# 1. The page is guaranteed to be full except the last page.
|
||||
# 2. page index = kv_index // page_size
|
||||
# The return vector is kv_indices[::page_size] // page_size
|
||||
if page_size == 1: # shortcut
|
||||
|
||||
@@ -86,8 +86,8 @@ class StructureInfo:
|
||||
|
||||
_GetInfoFunc = Callable[[str], StructureInfo]
|
||||
"""
|
||||
helper alias of function
|
||||
ususally it is a function that takes a name string and returns a StructureInfo object,
|
||||
Helper alias of function
|
||||
Usually it is a function that takes a name string and returns a StructureInfo object,
|
||||
which can be used to construct a structural_tag object
|
||||
"""
|
||||
|
||||
|
||||
@@ -308,7 +308,7 @@ class FlashAttentionBackend(AttentionBackend):
|
||||
), "Sliding window and cross attention are not supported together"
|
||||
|
||||
self.forward_metadata: FlashAttentionMetadata = None
|
||||
# extra metdata for handling speculative decoding topk > 1, extended draft decode and verify
|
||||
# extra metadata for handling speculative decoding topk > 1, extended draft decode and verify
|
||||
self.forward_metadata_spec_decode_expand: FlashAttentionMetadata = None
|
||||
self.max_context_len = model_runner.model_config.context_len
|
||||
self.device = model_runner.device
|
||||
|
||||
@@ -919,7 +919,7 @@ def _fwd_kernel(
|
||||
|
||||
e_max = n_e_max
|
||||
|
||||
# stage 2: compute the trianlge part
|
||||
# stage 2: compute the triangle part
|
||||
|
||||
cur_block_m_end = tl.minimum(cur_seq_len_extend, (cur_block_m + 1) * BLOCK_M)
|
||||
for start_n in range(0, cur_block_m_end, BLOCK_N):
|
||||
|
||||
@@ -201,7 +201,7 @@ def _dp_gather(
|
||||
global_tokens, local_tokens, 0, local_start_pos, local_num_tokens, False
|
||||
)
|
||||
|
||||
# Input IDs are in int 32. We should use inplace_all_reduce for local case becaues of custom all reduce.
|
||||
# Input IDs are in int 32. We should use inplace_all_reduce for local case because of custom all reduce.
|
||||
NUM_GPUS_PER_NODE = 8
|
||||
if (
|
||||
not local_tokens.dtype.is_floating_point
|
||||
|
||||
@@ -76,7 +76,7 @@ class RMSNorm(CustomOp):
|
||||
residual: Optional[torch.Tensor] = None,
|
||||
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
|
||||
if not x.is_contiguous():
|
||||
# NOTE: Romove this if aiter kernel supports discontinuous input
|
||||
# NOTE: Remove this if aiter kernel supports discontinuous input
|
||||
x = x.contiguous()
|
||||
if residual is not None:
|
||||
fused_add_rms_norm(x, residual, self.weight.data, self.variance_epsilon)
|
||||
|
||||
@@ -116,7 +116,7 @@ def deepep_run_moe_deep_preprocess(topk_ids: torch.Tensor, num_experts: int):
|
||||
seg_indptr = torch.empty(num_experts + 1, device=topk_ids.device, dtype=torch.int64)
|
||||
src2dst = torch.empty(topk_ids.numel(), device=topk_ids.device, dtype=torch.int64)
|
||||
|
||||
# Find offet
|
||||
# Find offset
|
||||
expert_ids = torch.arange(
|
||||
num_experts + 1, device=topk_ids.device, dtype=reorder_topk_ids.dtype
|
||||
)
|
||||
|
||||
@@ -611,7 +611,7 @@ class Fp8EPMoEMethod(Fp8MoEMethod):
|
||||
self.quant_config.weight_block_size[1],
|
||||
)
|
||||
# NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
|
||||
# Required by collum parallel or enabling merged weights
|
||||
# Required by column parallel or enabling merged weights
|
||||
if intermediate_size % block_n != 0:
|
||||
raise ValueError(
|
||||
f"The output_size of gate's and up's weight = "
|
||||
|
||||
@@ -994,7 +994,7 @@ def get_default_config(
|
||||
"num_stages": 2 if _is_hip else 4,
|
||||
}
|
||||
else:
|
||||
# Block-wise quant: BLOCK_SIZE_K must be divisable by block_shape[1]
|
||||
# Block-wise quant: BLOCK_SIZE_K must be divisible by block_shape[1]
|
||||
config = {
|
||||
"BLOCK_SIZE_M": 64,
|
||||
"BLOCK_SIZE_N": block_shape[0],
|
||||
|
||||
@@ -270,7 +270,7 @@ def select_experts(
|
||||
routed_scaling_factor: Optional[float] = None,
|
||||
):
|
||||
n_share_experts_fusion = global_server_args_dict["n_share_experts_fusion"]
|
||||
# DeekSeek V2/V3/R1 serices models uses grouped_top_k
|
||||
# DeepSeek V2/V3/R1 series models use grouped_top_k
|
||||
if use_grouped_topk:
|
||||
assert topk_group is not None
|
||||
assert num_expert_group is not None
|
||||
|
||||
@@ -109,7 +109,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
|
||||
if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
|
||||
raise ValueError(
|
||||
f"{quantization} quantization requires some operators from vllm. "
|
||||
"Pleaes install vllm by `pip install vllm==0.8.4`"
|
||||
"Please install vllm by `pip install vllm==0.8.4`"
|
||||
)
|
||||
|
||||
return QUANTIZATION_METHODS[quantization]
|
||||
|
||||
@@ -152,7 +152,7 @@ class BlockInt8LinearMethod(LinearMethodBase):
|
||||
f"{input_size_per_partition} is not divisible by "
|
||||
f"weight quantization block_k = {block_k}."
|
||||
)
|
||||
# Required by collum parallel or enabling merged weights
|
||||
# Required by column parallel or enabling merged weights
|
||||
if (tp_size > 1 and output_size // output_size_per_partition == tp_size) or len(
|
||||
output_partition_sizes
|
||||
) > 1:
|
||||
@@ -285,7 +285,7 @@ class BlockInt8MoEMethod:
|
||||
self.quant_config.weight_block_size[1],
|
||||
)
|
||||
# NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
|
||||
# Required by collum parallel or enabling merged weights
|
||||
# Required by column parallel or enabling merged weights
|
||||
if intermediate_size % block_n != 0:
|
||||
raise ValueError(
|
||||
f"The output_size of gate's and up's weight = "
|
||||
|
||||
@@ -103,10 +103,10 @@ _INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dic
|
||||
def _compile_warning_1():
|
||||
if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
|
||||
logger.warning(
|
||||
"Entering DeepGEMM JIT Pre-Complie session. "
|
||||
"Entering DeepGEMM JIT Pre-Compile session. "
|
||||
"And it may takes a long time(Typically 10-20 mins) "
|
||||
"if you have not run `sglang.compile_deep_gemm`. "
|
||||
"Recommand to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
|
||||
"It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
|
||||
" for pre-compilation to reduce the overhead if you have not run it before. "
|
||||
"For example: "
|
||||
"`python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code`"
|
||||
@@ -115,7 +115,7 @@ def _compile_warning_1():
|
||||
|
||||
def _compile_warning_2():
|
||||
logger.warning(
|
||||
"Entering DeepGEMM JIT Single Kernel Complie session. "
|
||||
"Entering DeepGEMM JIT Single Kernel Compile session. "
|
||||
"And it will makes inference throughput becomes flaky. "
|
||||
"Please run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
|
||||
" for pre-compilation to solve this issue. "
|
||||
@@ -298,7 +298,7 @@ def _maybe_compile_deep_gemm_one_type_all(
|
||||
logger.info(
|
||||
f"Try DeepGEMM JIT Compiling for "
|
||||
f"<{kernel_helper.name}> N={n}, K={k}, num_groups={num_groups} with all Ms."
|
||||
f"{' It only takes a litte time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
|
||||
f"{' It only takes a little time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
|
||||
)
|
||||
|
||||
# NOTE(alcanderian): get_num_sms should be changed when 2-batch-overlap is introduced
|
||||
|
||||
@@ -235,7 +235,7 @@ class Fp8LinearMethod(LinearMethodBase):
|
||||
f"{input_size_per_partition} is not divisible by "
|
||||
f"weight quantization block_k = {block_k}."
|
||||
)
|
||||
# Required by collum parallel or enabling merged weights
|
||||
# Required by column parallel or enabling merged weights
|
||||
if (
|
||||
tp_size > 1 and output_size // output_size_per_partition == tp_size
|
||||
) or len(output_partition_sizes) > 1:
|
||||
@@ -491,7 +491,7 @@ class Fp8MoEMethod:
|
||||
self.quant_config.weight_block_size[1],
|
||||
)
|
||||
# NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
|
||||
# Required by collum parallel or enabling merged weights
|
||||
# Required by column parallel or enabling merged weights
|
||||
if intermediate_size % block_n != 0:
|
||||
raise ValueError(
|
||||
f"The output_size of gate's and up's weight = "
|
||||
|
||||
@@ -104,7 +104,7 @@ def _per_token_group_quant_fp8(
|
||||
y_s_ptr,
|
||||
# Stride of input
|
||||
y_stride,
|
||||
# Collums of input
|
||||
# Columns of input
|
||||
N,
|
||||
# Avoid division by zero
|
||||
eps,
|
||||
@@ -342,7 +342,7 @@ def _static_quant_fp8(
|
||||
y_s_repeat_ptr,
|
||||
# Stride of input
|
||||
y_stride,
|
||||
# Collums of input
|
||||
# Columns of input
|
||||
N,
|
||||
# Information for float8
|
||||
fp8_min,
|
||||
@@ -794,7 +794,7 @@ def w8a8_block_fp8_matmul(
|
||||
config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
|
||||
else:
|
||||
# Default config
|
||||
# Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1]
|
||||
# Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
|
||||
config = {
|
||||
"BLOCK_SIZE_M": 64,
|
||||
"BLOCK_SIZE_N": block_size[0],
|
||||
|
||||
@@ -76,7 +76,7 @@ def _per_token_group_quant_int8(
|
||||
y_s_ptr,
|
||||
# Stride of input
|
||||
y_stride,
|
||||
# Collums of input
|
||||
# Columns of input
|
||||
N,
|
||||
# Avoid division by zero
|
||||
eps,
|
||||
@@ -370,7 +370,7 @@ def w8a8_block_int8_matmul(
|
||||
config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
|
||||
else:
|
||||
# Default config
|
||||
# Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1]
|
||||
# Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
|
||||
config = {
|
||||
"BLOCK_SIZE_M": 64,
|
||||
"BLOCK_SIZE_N": block_size[0],
|
||||
|
||||
@@ -100,7 +100,7 @@ class LoRAManager:
|
||||
self.configs[name] = LoRAConfig(path)
|
||||
self.hf_target_names.update(self.configs[name].target_modules)
|
||||
|
||||
# Target lora weight names for lora_a and lora_b modules repectively.
|
||||
# Target lora weight names for lora_a and lora_b modules respectively.
|
||||
# e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj")}
|
||||
self.lora_weight_names: Set[Tuple[str]] = set(
|
||||
[get_stacked_name(module) for module in self.hf_target_names]
|
||||
|
||||
@@ -50,15 +50,15 @@ class LoRAMemoryPool:
|
||||
self.uid_to_buffer_id: Dict[Optional[str], int] = {}
|
||||
|
||||
# Buffer idx -> lora uid in memory pool
|
||||
# All uids are initalized as empty strings for empty buffer slots
|
||||
# Here we don't initalize to None since None is a valid uid
|
||||
# All uids are initialized as empty strings for empty buffer slots
|
||||
# Here we don't initialize to None since None is a valid uid
|
||||
self.buffer_id_to_uid: List[Optional[str]] = [""] * self.max_loras_per_batch
|
||||
|
||||
def get_lora_A_shape(
|
||||
self, module_name: str, base_model: torch.nn.Module
|
||||
) -> Tuple[int]:
|
||||
"""
|
||||
Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
|
||||
Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
|
||||
"""
|
||||
input_dim, _ = get_hidden_dim(module_name, self.base_hf_config, base_model)
|
||||
c = get_stacked_multiply(module_name)
|
||||
@@ -75,7 +75,7 @@ class LoRAMemoryPool:
|
||||
self, module_name: str, base_model: torch.nn.Module
|
||||
) -> Tuple[int]:
|
||||
"""
|
||||
Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
|
||||
Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
|
||||
"""
|
||||
_, output_dim = get_hidden_dim(module_name, self.base_hf_config, base_model)
|
||||
c = get_stacked_multiply(module_name)
|
||||
|
||||
@@ -77,7 +77,7 @@ def _gate_up_lora_b_kernel(
|
||||
k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
|
||||
)
|
||||
|
||||
# Iteate to compute the block in output matrix
|
||||
# Iterate to compute the block in output matrix
|
||||
partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
|
||||
for k in range(0, tl.cdiv(K, BLOCK_K)):
|
||||
x_tile = tl.load(
|
||||
|
||||
@@ -79,7 +79,7 @@ def _qkv_lora_b_kernel(
|
||||
k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
|
||||
)
|
||||
|
||||
# Iteate to compute the block in output matrix
|
||||
# Iterate to compute the block in output matrix
|
||||
partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
|
||||
for k in range(0, tl.cdiv(K, BLOCK_K)):
|
||||
x_tile = tl.load(
|
||||
|
||||
@@ -67,7 +67,7 @@ def _sgemm_lora_a_kernel(
|
||||
k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
|
||||
)
|
||||
|
||||
# Iteate to compute the block in output matrix
|
||||
# Iterate to compute the block in output matrix
|
||||
partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
|
||||
for k in range(0, tl.cdiv(K, BLOCK_K)):
|
||||
x_tile = tl.load(
|
||||
|
||||
@@ -69,7 +69,7 @@ def _sgemm_lora_b_kernel(
|
||||
k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
|
||||
)
|
||||
|
||||
# Iteate to compute the block in output matrix
|
||||
# Iterate to compute the block in output matrix
|
||||
partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
|
||||
for k in range(0, tl.cdiv(K, BLOCK_K)):
|
||||
x_tile = tl.load(
|
||||
|
||||
@@ -79,7 +79,7 @@ def get_hidden_dim(
|
||||
module_name: str, config: AutoConfig, base_model: torch.nn.Module
|
||||
) -> Tuple[int]:
|
||||
"""
|
||||
Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
|
||||
Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
|
||||
"""
|
||||
|
||||
if hasattr(base_model, "get_hidden_dim"):
|
||||
|
||||
@@ -210,7 +210,7 @@ class DataParallelController:
|
||||
)
|
||||
# compute zmq ports for this dp rank
|
||||
rank_port_args = PortArgs.init_new(server_args, dp_rank)
|
||||
# Data parallelism resues the tensor parallelism group,
|
||||
# Data parallelism reuses the tensor parallelism group,
|
||||
# so all dp ranks should use the same nccl port.
|
||||
rank_port_args.nccl_port = port_args.nccl_port
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""
|
||||
The definition of objects transfered between different
|
||||
The definition of objects transferred between different
|
||||
processes (TokenizerManager, DetokenizerManager, Controller).
|
||||
"""
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern)
|
||||
self, input_ids: List[int], mm_inputs: MultimodalInputs
|
||||
) -> List[int]:
|
||||
"""
|
||||
This function will replace the data-tokens inbetween with pad_values accordingly
|
||||
This function will replace the data-tokens in between with pad_values accordingly
|
||||
"""
|
||||
pad_values = [item.pad_value for item in mm_inputs.mm_items]
|
||||
data_token_pairs = self.data_token_id_pairs
|
||||
|
||||
@@ -879,7 +879,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
||||
error_msg = (
|
||||
f"{phase_str} out of memory. Try to lower your batch size.\n"
|
||||
f"Try to allocate {num_tokens} tokens.\n"
|
||||
f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
|
||||
f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
|
||||
)
|
||||
logger.error(error_msg)
|
||||
if self.tree_cache is not None:
|
||||
@@ -920,7 +920,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
||||
error_msg = (
|
||||
f"Prefill out of memory. Try to lower your batch size.\n"
|
||||
f"Try to allocate {extend_num_tokens} tokens.\n"
|
||||
f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
|
||||
f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
|
||||
f"{self.token_to_kv_pool_allocator.available_size()=}\n"
|
||||
f"{self.tree_cache.evictable_size()=}\n"
|
||||
)
|
||||
@@ -955,7 +955,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
||||
error_msg = (
|
||||
f"Decode out of memory. Try to lower your batch size.\n"
|
||||
f"Try to allocate {len(seq_lens)} tokens.\n"
|
||||
f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
|
||||
f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
|
||||
f"{self.token_to_kv_pool_allocator.available_size()=}\n"
|
||||
f"{self.tree_cache.evictable_size()=}\n"
|
||||
)
|
||||
|
||||
@@ -1325,7 +1325,7 @@ class Scheduler(
|
||||
return None
|
||||
|
||||
running_bs = len(self.running_batch.reqs)
|
||||
# Igore the check if self.chunked_req is not None.
|
||||
# Ignore the check if self.chunked_req is not None.
|
||||
# In the non-PP case, when self.chunked_req is not None, num_allocatable_reqs should always be greater than 0,
|
||||
# as the space for the chunked request has just been released.
|
||||
# In PP case, a chunked req can start in one microbatch and end in another microbatch, so the max_running_requests per microbatch should not be strict.
|
||||
|
||||
@@ -1273,7 +1273,7 @@ class TokenizerManager:
|
||||
self.model_update_result.set_result(recv_obj)
|
||||
else: # self.server_args.dp_size > 1
|
||||
self.model_update_tmp.append(recv_obj)
|
||||
# set future if the all results are recevied
|
||||
# set future if all the results are received
|
||||
if len(self.model_update_tmp) == self.server_args.dp_size:
|
||||
self.model_update_result.set_result(self.model_update_tmp)
|
||||
|
||||
|
||||
@@ -296,12 +296,12 @@ class CudaGraphRunner:
|
||||
self.capture()
|
||||
except RuntimeError as e:
|
||||
raise Exception(
|
||||
f"Capture cuda graph failed: {e}\n"
|
||||
f"Capture CUDA graph failed: {e}\n"
|
||||
"Possible solutions:\n"
|
||||
"1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
|
||||
"2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
|
||||
"3. disable torch compile by not using --enable-torch-compile\n"
|
||||
"4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
|
||||
"4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
|
||||
"Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
|
||||
)
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ class ForwardMode(IntEnum):
|
||||
DECODE = auto()
|
||||
# Contains both EXTEND and DECODE when doing chunked prefill.
|
||||
MIXED = auto()
|
||||
# No sequence to forward. For data parallel attention, some workers wil be IDLE if no sequence are allocated.
|
||||
# No sequence to forward. For data parallel attention, some workers will be IDLE if no sequences are allocated.
|
||||
IDLE = auto()
|
||||
|
||||
# Used in speculative decoding: verify a batch in the target model.
|
||||
|
||||
@@ -188,7 +188,7 @@ def trunc_normal_tf_(
|
||||
best when :math:`a \\leq \text{mean} \\leq b`.
|
||||
NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
|
||||
bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
|
||||
and the result is subsquently scaled and shifted by the mean and std args.
|
||||
and the result is subsequently scaled and shifted by the mean and std args.
|
||||
Args:
|
||||
tensor: an n-dimensional `torch.Tensor`
|
||||
mean: the mean of the normal distribution
|
||||
@@ -735,7 +735,7 @@ class VisionTransformer(nn.Module):
|
||||
img_size: Input image size.
|
||||
patch_size: Patch size.
|
||||
in_chans: Number of image input channels.
|
||||
num_classes: Mumber of classes for classification head.
|
||||
num_classes: Number of classes for classification head.
|
||||
global_pool: Type of global pooling for final sequence (default: 'token').
|
||||
embed_dim: Transformer embedding dimension.
|
||||
depth: Depth of transformer.
|
||||
|
||||
@@ -1287,7 +1287,7 @@ class DeepseekV2DecoderLayer(nn.Module):
|
||||
# Fully Connected
|
||||
hidden_states = self.mlp(hidden_states)
|
||||
|
||||
# TODO(ch-wan): ues reduce-scatter in MLP to avoid this scatter
|
||||
# TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter
|
||||
# Scatter
|
||||
if self.dp_size != 1:
|
||||
# important: forward batch.gathered_buffer is used both after scatter and after gather.
|
||||
@@ -1499,7 +1499,7 @@ class DeepseekV2ForCausalLM(nn.Module):
|
||||
else:
|
||||
assert (
|
||||
self.n_share_experts_fusion == self.tp_size
|
||||
), f"Shared experts fusion optimization is enabled in DeepSeek V3/R1, set it to {self.tp_size} can get best optimized performace."
|
||||
), f"Shared experts fusion optimization is enabled in DeepSeek V3/R1, set it to {self.tp_size} can get best optimized performance."
|
||||
elif self.n_share_experts_fusion == 0:
|
||||
if (
|
||||
_is_cuda
|
||||
@@ -1665,7 +1665,7 @@ class DeepseekV2ForCausalLM(nn.Module):
|
||||
if is_nextn:
|
||||
if hasattr(self.config, "num_nextn_predict_layers"):
|
||||
num_nextn_layers = self.config.num_nextn_predict_layers
|
||||
assert num_nextn_layers == 1, "Only 1 nextn layer is supportted"
|
||||
assert num_nextn_layers == 1, "Only 1 nextn layer is supported"
|
||||
# compatible with old design
|
||||
nextn_layer_id = (
|
||||
0
|
||||
|
||||
@@ -428,7 +428,7 @@ class Llama4DecoderLayer(nn.Module):
|
||||
# Fully Connected
|
||||
hidden_states = self.feed_forward(hidden_states, forward_batch)
|
||||
|
||||
# TODO(ch-wan): ues reduce-scatter in MLP to avoid this scatter
|
||||
# TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter
|
||||
# Scatter
|
||||
if self.dp_size != 1:
|
||||
# important: forward batch.gathered_buffer is used both after scatter and after gather.
|
||||
|
||||
@@ -57,7 +57,7 @@ class RobertaEmbedding(nn.Module):
|
||||
input_shape = input_ids.size()
|
||||
inputs_embeds = self.word_embeddings(input_ids)
|
||||
|
||||
# adpated from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
|
||||
# Adapted from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
|
||||
|
||||
pos_list = []
|
||||
token_list = []
|
||||
|
||||
@@ -37,7 +37,7 @@ $ python3 -m sglang.bench_one_batch --correct \
|
||||
--tensor-parallel-size 2 \
|
||||
--disable-cuda-graph
|
||||
```
|
||||
We will eanble CUDA Graph support soon.
|
||||
We will enable CUDA Graph support soon.
|
||||
"""
|
||||
|
||||
import types
|
||||
|
||||
@@ -590,7 +590,7 @@ def v1_generate_response(
|
||||
echo = False
|
||||
|
||||
if (not isinstance(request, list)) and request.echo:
|
||||
# TODO: handle the case propmt is token ids
|
||||
# TODO: handle the case prompt is token ids
|
||||
if isinstance(request.prompt, list) and isinstance(request.prompt[0], str):
|
||||
# for the case of multiple str prompts
|
||||
prompts = request.prompt
|
||||
@@ -646,7 +646,7 @@ def v1_generate_response(
|
||||
finish_reason = ret_item["meta_info"]["finish_reason"]
|
||||
|
||||
if to_file:
|
||||
# to make the choise data json serializable
|
||||
# to make the choice data json serializable
|
||||
choice_data = {
|
||||
"index": 0,
|
||||
"text": text,
|
||||
|
||||
@@ -147,7 +147,7 @@ class ReasoningParser:
|
||||
|
||||
Args:
|
||||
model_type (str): Type of model to parse reasoning from
|
||||
stream_reasoning (bool): If Flase, accumulates reasoning content until complete.
|
||||
stream_reasoning (bool): If False, accumulates reasoning content until complete.
|
||||
If True, streams reasoning content as it arrives.
|
||||
"""
|
||||
|
||||
|
||||
@@ -294,7 +294,7 @@ class SamplingBatchInfo:
|
||||
# Set the flag to True if any of the two has custom logit processor
|
||||
self.has_custom_logit_processor = True
|
||||
|
||||
# Note: becasue the __len()__ operator is defined on the temperatures tensor,
|
||||
# Note: because the __len()__ operator is defined on the temperatures tensor,
|
||||
# please make sure any merge operation with len(self) or len(other) is done before
|
||||
# the merge operation of the temperatures tensor below.
|
||||
for item in [
|
||||
|
||||
@@ -825,7 +825,7 @@ class ServerArgs:
|
||||
# Multi-node distributed serving
|
||||
parser.add_argument(
|
||||
"--dist-init-addr",
|
||||
"--nccl-init-addr", # For backward compatbility. This will be removed in the future.
|
||||
"--nccl-init-addr", # For backward compatibility. This will be removed in the future.
|
||||
type=str,
|
||||
help="The host address for initializing distributed backend (e.g., `192.168.0.2:25000`).",
|
||||
)
|
||||
@@ -1096,7 +1096,7 @@ class ServerArgs:
|
||||
parser.add_argument(
|
||||
"--triton-attention-reduce-in-fp32",
|
||||
action="store_true",
|
||||
help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
|
||||
help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16."
|
||||
"This only affects Triton attention kernels.",
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -1188,7 +1188,7 @@ class ServerArgs:
|
||||
type=int,
|
||||
default=0,
|
||||
help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
|
||||
"set it to tp_size can get best optimized performace.",
|
||||
"set it to tp_size can get best optimized performance.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-chunked-prefix-cache",
|
||||
|
||||
@@ -82,12 +82,12 @@ class EAGLEDraftCudaGraphRunner:
|
||||
self.capture()
|
||||
except RuntimeError as e:
|
||||
raise Exception(
|
||||
f"Capture cuda graph failed: {e}\n"
|
||||
f"Capture CUDA graph failed: {e}\n"
|
||||
"Possible solutions:\n"
|
||||
"1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
|
||||
"2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
|
||||
"3. disable torch compile by not using --enable-torch-compile\n"
|
||||
"4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
|
||||
"4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
|
||||
"Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
|
||||
)
|
||||
|
||||
@@ -149,7 +149,7 @@ class EAGLEDraftCudaGraphRunner:
|
||||
|
||||
# Run and capture
|
||||
def run_once():
|
||||
# Backup two fileds, which will be modified in-place in `draft_forward`.
|
||||
# Backup two fields, which will be modified in-place in `draft_forward`.
|
||||
output_cache_loc_backup = forward_batch.out_cache_loc
|
||||
hidden_states_backup = forward_batch.spec_info.hidden_states
|
||||
|
||||
|
||||
@@ -167,12 +167,12 @@ class EagleVerifyOutput:
|
||||
draft_input: EagleDraftInput
|
||||
# Logit outputs from target worker
|
||||
logits_output: LogitsProcessorOutput
|
||||
# Accepeted token ids including the bonus token
|
||||
# Accepted token ids including the bonus token
|
||||
verified_id: torch.Tensor
|
||||
# Accepeted token length per sequence in a batch in CPU.
|
||||
# Accepted token length per sequence in a batch in CPU.
|
||||
accept_length_per_req_cpu: List[int]
|
||||
# Accepeted indices from logits_output.next_token_logits
|
||||
accepeted_indices: torch.Tensor
|
||||
# Accepted indices from logits_output.next_token_logits
|
||||
accepted_indices: torch.Tensor
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -316,7 +316,7 @@ class EagleVerifyInput:
|
||||
|
||||
This API updates values inside logits_output based on the accepted
|
||||
tokens. I.e., logits_output.next_token_logits only contains
|
||||
accepeted token logits.
|
||||
accepted token logits.
|
||||
"""
|
||||
bs = self.retrive_index.shape[0]
|
||||
candidates = self.draft_token.reshape(bs, self.draft_token_num)
|
||||
@@ -493,7 +493,7 @@ class EagleVerifyInput:
|
||||
logits_output=logits_output,
|
||||
verified_id=verified_id,
|
||||
accept_length_per_req_cpu=accept_length_cpu,
|
||||
accepeted_indices=accept_index,
|
||||
accepted_indices=accept_index,
|
||||
)
|
||||
else:
|
||||
assign_req_to_token_pool[(bs,)](
|
||||
@@ -539,7 +539,7 @@ class EagleVerifyInput:
|
||||
logits_output=logits_output,
|
||||
verified_id=verified_id,
|
||||
accept_length_per_req_cpu=accept_length_cpu,
|
||||
accepeted_indices=accept_index,
|
||||
accepted_indices=accept_index,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -201,7 +201,7 @@ class EAGLEWorker(TpModelWorker):
|
||||
self.has_prefill_wrapper_verify = False
|
||||
else:
|
||||
raise ValueError(
|
||||
f"EAGLE is not supportted in attention backend {self.server_args.attention_backend}"
|
||||
f"EAGLE is not supported in attention backend {self.server_args.attention_backend}"
|
||||
)
|
||||
|
||||
self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
|
||||
@@ -245,8 +245,8 @@ class EAGLEWorker(TpModelWorker):
|
||||
Args:
|
||||
batch: The batch to run forward. The state of the batch is modified as it runs.
|
||||
Returns:
|
||||
A tuple of the final logit output of the target model, next tokens accepeted,
|
||||
the batch id (used for overlap schedule), and number of accepeted tokens.
|
||||
A tuple of the final logit output of the target model, next tokens accepted,
|
||||
the batch id (used for overlap schedule), and number of accepted tokens.
|
||||
"""
|
||||
if batch.forward_mode.is_decode():
|
||||
with self.draft_tp_context(self.draft_model_runner.tp_group):
|
||||
@@ -491,11 +491,11 @@ class EAGLEWorker(TpModelWorker):
|
||||
)
|
||||
|
||||
# Post process based on verified outputs.
|
||||
# Pick indices that we care (accepeted)
|
||||
# Pick indices that we care (accepted)
|
||||
logits_output.next_token_logits = logits_output.next_token_logits[
|
||||
res.accepeted_indices
|
||||
res.accepted_indices
|
||||
]
|
||||
logits_output.hidden_states = logits_output.hidden_states[res.accepeted_indices]
|
||||
logits_output.hidden_states = logits_output.hidden_states[res.accepted_indices]
|
||||
|
||||
# Prepare the batch for the next draft forwards.
|
||||
batch.forward_mode = ForwardMode.DECODE
|
||||
@@ -597,7 +597,7 @@ class EAGLEWorker(TpModelWorker):
|
||||
self.capture_for_decode(logits_output, forward_batch.spec_info)
|
||||
|
||||
def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
|
||||
# Backup fileds that will be modified in-place
|
||||
# Backup fields that will be modified in-place
|
||||
seq_lens_backup = batch.seq_lens.clone()
|
||||
req_pool_indices_backup = batch.req_pool_indices
|
||||
accept_length_backup = batch.spec_info.accept_length
|
||||
|
||||
@@ -140,7 +140,7 @@ class ChatCompletionSampler(SamplerBase):
|
||||
max_tokens=self.max_tokens,
|
||||
)
|
||||
return response.choices[0].message.content
|
||||
# NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are reruning MMMU
|
||||
# NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
|
||||
except openai.BadRequestError as e:
|
||||
print("Bad Request Error", e)
|
||||
return ""
|
||||
|
||||
@@ -121,7 +121,7 @@ class HumanEval(Eval):
|
||||
convo=convo,
|
||||
metrics={
|
||||
f"pass@{k}": estimate_pass_at_k([total], [correct], k)
|
||||
# this will be aggrated so no need of .mean()
|
||||
# this will be aggregated so no need of .mean()
|
||||
for k in self._ks_passes
|
||||
if total >= k
|
||||
},
|
||||
|
||||
@@ -370,7 +370,7 @@ def test_dtype_gen():
|
||||
@sgl.function
|
||||
def dtype_gen(s):
|
||||
s += "Q: What is the full name of DNS?\n"
|
||||
s += "A: The full nams is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
|
||||
s += "A: The full names is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
|
||||
s += "Q: Which year was DNS invented?\n"
|
||||
s += "A: " + sgl.gen("int_res", dtype=int) + "\n"
|
||||
s += "Q: What is the value of pi?\n"
|
||||
|
||||
@@ -278,7 +278,7 @@ def graceful_registry(sub_module_name: str):
|
||||
f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
|
||||
)
|
||||
if signum == signal.SIGTERM:
|
||||
logger.info(f"{sub_module_name} recive sigterm")
|
||||
logger.info(f"{sub_module_name} receive sigterm")
|
||||
|
||||
signal.signal(signal.SIGTERM, graceful_shutdown)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user