Add typo checker in pre-commit (#6179)

Co-authored-by: Brayden Zhong <b8zhong@uwaterloo.ca>
This commit is contained in:
applesaucethebun
2025-05-11 00:55:00 -04:00
committed by GitHub
parent de167cf5fa
commit 2ce8793519
99 changed files with 154 additions and 144 deletions

View File

@@ -147,3 +147,7 @@ exclude = [
"scripts*",
"tests*",
]
[tool.codespell]
ignore-words-list = "ans, als, hel, boostrap, childs, te, vas, hsa, ment"
skip = "*.json,*.jsonl,*.patch,*.txt"

View File

@@ -315,7 +315,7 @@ def throughput_test(
tokenizer_id = server_args.tokenizer_path or server_args.model_path
tokenizer = get_tokenizer(tokenizer_id)
# Set global environmnets
# Set global environments
set_ulimit()
random.seed(bench_args.seed)
np.random.seed(bench_args.seed)

View File

@@ -1263,7 +1263,7 @@ async def benchmark(
print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
print(
"{:<40} {:<10}".format(
"Max reqeuest concurrency:",
"Max request concurrency:",
max_concurrency if max_concurrency else "not set",
)
)

View File

@@ -129,7 +129,7 @@ def launch_server_process_and_send_one_request(
def refine_server_args(server_args: ServerArgs, compile_args: CompileArgs):
# Disbale cuda graph and torch compile to save time
# Disable cuda graph and torch compile to save time
server_args.disable_cuda_graph = True
server_args.enable_torch_compile = False
print(f"Disable CUDA Graph and Torch Compile to save time...")

View File

@@ -38,7 +38,7 @@ def extract_prefix_by_tracing(program, backend):
with TracingScope(tracer):
tracer.ret_value = program.func(tracer, **arguments)
except (StopTracing, TypeError, AttributeError):
# Some exceptions may not be catched
# Some exceptions may not be caught
pass
# Run and cache prefix

View File

@@ -27,7 +27,7 @@ completion_template_name = None
class FimPosition:
"""Postion of fim middle token."""
"""Position of fim middle token."""
MIDDLE = auto()
END = auto()

View File

@@ -416,9 +416,9 @@ class DeepseekVLV2Processor(ProcessorMixin):
h = w = math.ceil(
(self.image_size // self.patch_size) / self.downsample_ratio
)
# global views tokens h * (w + 1), 1 is for line seperator
# global views tokens h * (w + 1), 1 is for line separator
tokenized_image = [self.image_token_id] * h * (w + 1)
# add a seperator between global and local views
# add a separator between global and local views
tokenized_image += [self.image_token_id]
# local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
tokenized_image += (

View File

@@ -509,7 +509,7 @@ class SchedulerDisaggregationDecodeMixin:
def event_loop_overlap_disagg_decode(self: Scheduler):
result_queue = deque()
self.last_batch: Optional[ScheduleBatch] = None
self.last_batch_in_queue = False # last batch is modifed in-place, so we need another variable to track if it's extend
self.last_batch_in_queue = False # last batch is modified in-place, so we need another variable to track if it's extend
while True:
recv_reqs = self.recv_requests()

View File

@@ -54,7 +54,7 @@ class FakeKVSender(BaseKVSender):
logger.info(f"FakeKVSender send success")
else:
self.has_sent = False
logger.info(f"FakeKVSender send fake transfering")
logger.info(f"FakeKVSender send fake transferring")
def failure_exception(self):
raise Exception("Fake KVSender Exception")

View File

@@ -363,7 +363,7 @@ class MooncakeKVManager(BaseKVManager):
self.request_status[bootstrap_room] = KVPoll.WaitingForInput
def check_status(self, bootstrap_room: int):
# TOOD: do we really need the poll()?
# TODO: do we really need the poll()?
return self.request_status[bootstrap_room]

View File

@@ -112,7 +112,7 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
def kv_to_page_indices(kv_indices: np.ndarray, page_size: int):
# 1. The page is guaruanteed to be full except the last page.
# 1. The page is guaranteed to be full except the last page.
# 2. page index = kv_index // page_size
# The return vector is kv_indices[::page_size] // page_size
if page_size == 1: # shortcut

View File

@@ -86,8 +86,8 @@ class StructureInfo:
_GetInfoFunc = Callable[[str], StructureInfo]
"""
helper alias of function
ususally it is a function that takes a name string and returns a StructureInfo object,
Helper alias of function
Usually it is a function that takes a name string and returns a StructureInfo object,
which can be used to construct a structural_tag object
"""

View File

@@ -308,7 +308,7 @@ class FlashAttentionBackend(AttentionBackend):
), "Sliding window and cross attention are not supported together"
self.forward_metadata: FlashAttentionMetadata = None
# extra metdata for handling speculative decoding topk > 1, extended draft decode and verify
# extra metadata for handling speculative decoding topk > 1, extended draft decode and verify
self.forward_metadata_spec_decode_expand: FlashAttentionMetadata = None
self.max_context_len = model_runner.model_config.context_len
self.device = model_runner.device

View File

@@ -919,7 +919,7 @@ def _fwd_kernel(
e_max = n_e_max
# stage 2: compute the trianlge part
# stage 2: compute the triangle part
cur_block_m_end = tl.minimum(cur_seq_len_extend, (cur_block_m + 1) * BLOCK_M)
for start_n in range(0, cur_block_m_end, BLOCK_N):

View File

@@ -201,7 +201,7 @@ def _dp_gather(
global_tokens, local_tokens, 0, local_start_pos, local_num_tokens, False
)
# Input IDs are in int 32. We should use inplace_all_reduce for local case becaues of custom all reduce.
# Input IDs are in int 32. We should use inplace_all_reduce for local case because of custom all reduce.
NUM_GPUS_PER_NODE = 8
if (
not local_tokens.dtype.is_floating_point

View File

@@ -76,7 +76,7 @@ class RMSNorm(CustomOp):
residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
if not x.is_contiguous():
# NOTE: Romove this if aiter kernel supports discontinuous input
# NOTE: Remove this if aiter kernel supports discontinuous input
x = x.contiguous()
if residual is not None:
fused_add_rms_norm(x, residual, self.weight.data, self.variance_epsilon)

View File

@@ -116,7 +116,7 @@ def deepep_run_moe_deep_preprocess(topk_ids: torch.Tensor, num_experts: int):
seg_indptr = torch.empty(num_experts + 1, device=topk_ids.device, dtype=torch.int64)
src2dst = torch.empty(topk_ids.numel(), device=topk_ids.device, dtype=torch.int64)
# Find offet
# Find offset
expert_ids = torch.arange(
num_experts + 1, device=topk_ids.device, dtype=reorder_topk_ids.dtype
)

View File

@@ -611,7 +611,7 @@ class Fp8EPMoEMethod(Fp8MoEMethod):
self.quant_config.weight_block_size[1],
)
# NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
# Required by collum parallel or enabling merged weights
# Required by column parallel or enabling merged weights
if intermediate_size % block_n != 0:
raise ValueError(
f"The output_size of gate's and up's weight = "

View File

@@ -994,7 +994,7 @@ def get_default_config(
"num_stages": 2 if _is_hip else 4,
}
else:
# Block-wise quant: BLOCK_SIZE_K must be divisable by block_shape[1]
# Block-wise quant: BLOCK_SIZE_K must be divisible by block_shape[1]
config = {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": block_shape[0],

View File

@@ -270,7 +270,7 @@ def select_experts(
routed_scaling_factor: Optional[float] = None,
):
n_share_experts_fusion = global_server_args_dict["n_share_experts_fusion"]
# DeekSeek V2/V3/R1 serices models uses grouped_top_k
# DeepSeek V2/V3/R1 series models use grouped_top_k
if use_grouped_topk:
assert topk_group is not None
assert num_expert_group is not None

View File

@@ -109,7 +109,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
raise ValueError(
f"{quantization} quantization requires some operators from vllm. "
"Pleaes install vllm by `pip install vllm==0.8.4`"
"Please install vllm by `pip install vllm==0.8.4`"
)
return QUANTIZATION_METHODS[quantization]

View File

@@ -152,7 +152,7 @@ class BlockInt8LinearMethod(LinearMethodBase):
f"{input_size_per_partition} is not divisible by "
f"weight quantization block_k = {block_k}."
)
# Required by collum parallel or enabling merged weights
# Required by column parallel or enabling merged weights
if (tp_size > 1 and output_size // output_size_per_partition == tp_size) or len(
output_partition_sizes
) > 1:
@@ -285,7 +285,7 @@ class BlockInt8MoEMethod:
self.quant_config.weight_block_size[1],
)
# NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
# Required by collum parallel or enabling merged weights
# Required by column parallel or enabling merged weights
if intermediate_size % block_n != 0:
raise ValueError(
f"The output_size of gate's and up's weight = "

View File

@@ -103,10 +103,10 @@ _INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dic
def _compile_warning_1():
if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
logger.warning(
"Entering DeepGEMM JIT Pre-Complie session. "
"Entering DeepGEMM JIT Pre-Compile session. "
"And it may takes a long time(Typically 10-20 mins) "
"if you have not run `sglang.compile_deep_gemm`. "
"Recommand to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
"It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
" for pre-compilation to reduce the overhead if you have not run it before. "
"For example: "
"`python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code`"
@@ -115,7 +115,7 @@ def _compile_warning_1():
def _compile_warning_2():
logger.warning(
"Entering DeepGEMM JIT Single Kernel Complie session. "
"Entering DeepGEMM JIT Single Kernel Compile session. "
"And it will makes inference throughput becomes flaky. "
"Please run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
" for pre-compilation to solve this issue. "
@@ -298,7 +298,7 @@ def _maybe_compile_deep_gemm_one_type_all(
logger.info(
f"Try DeepGEMM JIT Compiling for "
f"<{kernel_helper.name}> N={n}, K={k}, num_groups={num_groups} with all Ms."
f"{' It only takes a litte time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
f"{' It only takes a little time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
)
# NOTE(alcanderian): get_num_sms should be change when 2-batch-overlap is introduced

View File

@@ -235,7 +235,7 @@ class Fp8LinearMethod(LinearMethodBase):
f"{input_size_per_partition} is not divisible by "
f"weight quantization block_k = {block_k}."
)
# Required by collum parallel or enabling merged weights
# Required by column parallel or enabling merged weights
if (
tp_size > 1 and output_size // output_size_per_partition == tp_size
) or len(output_partition_sizes) > 1:
@@ -491,7 +491,7 @@ class Fp8MoEMethod:
self.quant_config.weight_block_size[1],
)
# NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
# Required by collum parallel or enabling merged weights
# Required by column parallel or enabling merged weights
if intermediate_size % block_n != 0:
raise ValueError(
f"The output_size of gate's and up's weight = "

View File

@@ -104,7 +104,7 @@ def _per_token_group_quant_fp8(
y_s_ptr,
# Stride of input
y_stride,
# Collums of input
# Columns of input
N,
# Avoid to divide zero
eps,
@@ -342,7 +342,7 @@ def _static_quant_fp8(
y_s_repeat_ptr,
# Stride of input
y_stride,
# Collums of input
# Columns of input
N,
# Information for float8
fp8_min,
@@ -794,7 +794,7 @@ def w8a8_block_fp8_matmul(
config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
else:
# Default config
# Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1]
# Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
config = {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": block_size[0],

View File

@@ -76,7 +76,7 @@ def _per_token_group_quant_int8(
y_s_ptr,
# Stride of input
y_stride,
# Collums of input
# Columns of input
N,
# Avoid to divide zero
eps,
@@ -370,7 +370,7 @@ def w8a8_block_int8_matmul(
config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
else:
# Default config
# Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1]
# Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
config = {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": block_size[0],

View File

@@ -100,7 +100,7 @@ class LoRAManager:
self.configs[name] = LoRAConfig(path)
self.hf_target_names.update(self.configs[name].target_modules)
# Target lora weight names for lora_a and lora_b modules repectively.
# Target lora weight names for lora_a and lora_b modules respectively.
# e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj")}
self.lora_weight_names: Set[Tuple[str]] = set(
[get_stacked_name(module) for module in self.hf_target_names]

View File

@@ -50,15 +50,15 @@ class LoRAMemoryPool:
self.uid_to_buffer_id: Dict[Optional[str], int] = {}
# Buffer idx -> lora uid in memory pool
# All uids are initalized as empty strings for empty buffer slots
# Here we don't initalize to None since None is a valid uid
# All uids are initialized as empty strings for empty buffer slots
# Here we don't initialize to None since None is a valid uid
self.buffer_id_to_uid: List[Optional[str]] = [""] * self.max_loras_per_batch
def get_lora_A_shape(
self, module_name: str, base_model: torch.nn.Module
) -> Tuple[int]:
"""
Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
"""
input_dim, _ = get_hidden_dim(module_name, self.base_hf_config, base_model)
c = get_stacked_multiply(module_name)
@@ -75,7 +75,7 @@ class LoRAMemoryPool:
self, module_name: str, base_model: torch.nn.Module
) -> Tuple[int]:
"""
Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
"""
_, output_dim = get_hidden_dim(module_name, self.base_hf_config, base_model)
c = get_stacked_multiply(module_name)

View File

@@ -77,7 +77,7 @@ def _gate_up_lora_b_kernel(
k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
)
# Iteate to compute the block in output matrix
# Iterate to compute the block in output matrix
partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
for k in range(0, tl.cdiv(K, BLOCK_K)):
x_tile = tl.load(

View File

@@ -79,7 +79,7 @@ def _qkv_lora_b_kernel(
k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
)
# Iteate to compute the block in output matrix
# Iterate to compute the block in output matrix
partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
for k in range(0, tl.cdiv(K, BLOCK_K)):
x_tile = tl.load(

View File

@@ -67,7 +67,7 @@ def _sgemm_lora_a_kernel(
k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
)
# Iteate to compute the block in output matrix
# Iterate to compute the block in output matrix
partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
for k in range(0, tl.cdiv(K, BLOCK_K)):
x_tile = tl.load(

View File

@@ -69,7 +69,7 @@ def _sgemm_lora_b_kernel(
k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
)
# Iteate to compute the block in output matrix
# Iterate to compute the block in output matrix
partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
for k in range(0, tl.cdiv(K, BLOCK_K)):
x_tile = tl.load(

View File

@@ -79,7 +79,7 @@ def get_hidden_dim(
module_name: str, config: AutoConfig, base_model: torch.nn.Module
) -> Tuple[int]:
"""
Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
"""
if hasattr(base_model, "get_hidden_dim"):

View File

@@ -210,7 +210,7 @@ class DataParallelController:
)
# compute zmq ports for this dp rank
rank_port_args = PortArgs.init_new(server_args, dp_rank)
# Data parallelism resues the tensor parallelism group,
# Data parallelism reuses the tensor parallelism group,
# so all dp ranks should use the same nccl port.
rank_port_args.nccl_port = port_args.nccl_port

View File

@@ -12,7 +12,7 @@
# limitations under the License.
# ==============================================================================
"""
The definition of objects transfered between different
The definition of objects transferred between different
processes (TokenizerManager, DetokenizerManager, Controller).
"""

View File

@@ -51,7 +51,7 @@ class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern)
self, input_ids: List[int], mm_inputs: MultimodalInputs
) -> List[int]:
"""
This function will replace the data-tokens inbetween with pad_values accordingly
This function will replace the data-tokens in between with pad_values accordingly
"""
pad_values = [item.pad_value for item in mm_inputs.mm_items]
data_token_pairs = self.data_token_id_pairs

View File

@@ -879,7 +879,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
error_msg = (
f"{phase_str} out of memory. Try to lower your batch size.\n"
f"Try to allocate {num_tokens} tokens.\n"
f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
)
logger.error(error_msg)
if self.tree_cache is not None:
@@ -920,7 +920,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
error_msg = (
f"Prefill out of memory. Try to lower your batch size.\n"
f"Try to allocate {extend_num_tokens} tokens.\n"
f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
f"{self.token_to_kv_pool_allocator.available_size()=}\n"
f"{self.tree_cache.evictable_size()=}\n"
)
@@ -955,7 +955,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
error_msg = (
f"Decode out of memory. Try to lower your batch size.\n"
f"Try to allocate {len(seq_lens)} tokens.\n"
f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
f"{self.token_to_kv_pool_allocator.available_size()=}\n"
f"{self.tree_cache.evictable_size()=}\n"
)

View File

@@ -1325,7 +1325,7 @@ class Scheduler(
return None
running_bs = len(self.running_batch.reqs)
# Igore the check if self.chunked_req is not None.
# Ignore the check if self.chunked_req is not None.
# In the non-PP case, when self.chunked_req is not None, num_allocatable_reqs should always be greater than 0,
# as the space for the chunked request has just been released.
# In PP case, a chunked req can start in one microbatch and end in another microbatch, so the max_running_requests per microbatch should not be strict.

View File

@@ -1273,7 +1273,7 @@ class TokenizerManager:
self.model_update_result.set_result(recv_obj)
else: # self.server_args.dp_size > 1
self.model_update_tmp.append(recv_obj)
# set future if the all results are recevied
# set future if all the results are received
if len(self.model_update_tmp) == self.server_args.dp_size:
self.model_update_result.set_result(self.model_update_tmp)

View File

@@ -296,12 +296,12 @@ class CudaGraphRunner:
self.capture()
except RuntimeError as e:
raise Exception(
f"Capture cuda graph failed: {e}\n"
f"Capture CUDA graph failed: {e}\n"
"Possible solutions:\n"
"1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
"2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
"3. disable torch compile by not using --enable-torch-compile\n"
"4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
"4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
"Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
)

View File

@@ -58,7 +58,7 @@ class ForwardMode(IntEnum):
DECODE = auto()
# Contains both EXTEND and DECODE when doing chunked prefill.
MIXED = auto()
# No sequence to forward. For data parallel attention, some workers wil be IDLE if no sequence are allocated.
# No sequence to forward. For data parallel attention, some workers will be IDLE if no sequence are allocated.
IDLE = auto()
# Used in speculative decoding: verify a batch in the target model.

View File

@@ -188,7 +188,7 @@ def trunc_normal_tf_(
best when :math:`a \\leq \text{mean} \\leq b`.
NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
and the result is subsquently scaled and shifted by the mean and std args.
and the result is subsequently scaled and shifted by the mean and std args.
Args:
tensor: an n-dimensional `torch.Tensor`
mean: the mean of the normal distribution
@@ -735,7 +735,7 @@ class VisionTransformer(nn.Module):
img_size: Input image size.
patch_size: Patch size.
in_chans: Number of image input channels.
num_classes: Mumber of classes for classification head.
num_classes: Number of classes for classification head.
global_pool: Type of global pooling for final sequence (default: 'token').
embed_dim: Transformer embedding dimension.
depth: Depth of transformer.

View File

@@ -1287,7 +1287,7 @@ class DeepseekV2DecoderLayer(nn.Module):
# Fully Connected
hidden_states = self.mlp(hidden_states)
# TODO(ch-wan): ues reduce-scatter in MLP to avoid this scatter
# TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter
# Scatter
if self.dp_size != 1:
# important: forward batch.gathered_buffer is used both after scatter and after gather.
@@ -1499,7 +1499,7 @@ class DeepseekV2ForCausalLM(nn.Module):
else:
assert (
self.n_share_experts_fusion == self.tp_size
), f"Shared experts fusion optimization is enabled in DeepSeek V3/R1, set it to {self.tp_size} can get best optimized performace."
), f"Shared experts fusion optimization is enabled in DeepSeek V3/R1, set it to {self.tp_size} can get best optimized performance."
elif self.n_share_experts_fusion == 0:
if (
_is_cuda
@@ -1665,7 +1665,7 @@ class DeepseekV2ForCausalLM(nn.Module):
if is_nextn:
if hasattr(self.config, "num_nextn_predict_layers"):
num_nextn_layers = self.config.num_nextn_predict_layers
assert num_nextn_layers == 1, "Only 1 nextn layer is supportted"
assert num_nextn_layers == 1, "Only 1 nextn layer is supported"
# compatible with old design
nextn_layer_id = (
0

View File

@@ -428,7 +428,7 @@ class Llama4DecoderLayer(nn.Module):
# Fully Connected
hidden_states = self.feed_forward(hidden_states, forward_batch)
# TODO(ch-wan): ues reduce-scatter in MLP to avoid this scatter
# TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter
# Scatter
if self.dp_size != 1:
# important: forward batch.gathered_buffer is used both after scatter and after gather.

View File

@@ -57,7 +57,7 @@ class RobertaEmbedding(nn.Module):
input_shape = input_ids.size()
inputs_embeds = self.word_embeddings(input_ids)
# adpated from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
# Adapted from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
pos_list = []
token_list = []

View File

@@ -37,7 +37,7 @@ $ python3 -m sglang.bench_one_batch --correct \
--tensor-parallel-size 2 \
--disable-cuda-graph
```
We will eanble CUDA Graph support soon.
We will enable CUDA Graph support soon.
"""
import types

View File

@@ -590,7 +590,7 @@ def v1_generate_response(
echo = False
if (not isinstance(request, list)) and request.echo:
# TODO: handle the case propmt is token ids
# TODO: handle the case prompt is token ids
if isinstance(request.prompt, list) and isinstance(request.prompt[0], str):
# for the case of multiple str prompts
prompts = request.prompt
@@ -646,7 +646,7 @@ def v1_generate_response(
finish_reason = ret_item["meta_info"]["finish_reason"]
if to_file:
# to make the choise data json serializable
# to make the choice data json serializable
choice_data = {
"index": 0,
"text": text,

View File

@@ -147,7 +147,7 @@ class ReasoningParser:
Args:
model_type (str): Type of model to parse reasoning from
stream_reasoning (bool): If Flase, accumulates reasoning content until complete.
stream_reasoning (bool): If False, accumulates reasoning content until complete.
If True, streams reasoning content as it arrives.
"""

View File

@@ -294,7 +294,7 @@ class SamplingBatchInfo:
# Set the flag to True if any of the two has custom logit processor
self.has_custom_logit_processor = True
# Note: becasue the __len()__ operator is defined on the temperatures tensor,
# Note: because the __len()__ operator is defined on the temperatures tensor,
# please make sure any merge operation with len(self) or len(other) is done before
# the merge operation of the temperatures tensor below.
for item in [

View File

@@ -825,7 +825,7 @@ class ServerArgs:
# Multi-node distributed serving
parser.add_argument(
"--dist-init-addr",
"--nccl-init-addr", # For backward compatbility. This will be removed in the future.
"--nccl-init-addr", # For backward compatibility. This will be removed in the future.
type=str,
help="The host address for initializing distributed backend (e.g., `192.168.0.2:25000`).",
)
@@ -1096,7 +1096,7 @@ class ServerArgs:
parser.add_argument(
"--triton-attention-reduce-in-fp32",
action="store_true",
help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16."
"This only affects Triton attention kernels.",
)
parser.add_argument(
@@ -1188,7 +1188,7 @@ class ServerArgs:
type=int,
default=0,
help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
"set it to tp_size can get best optimized performace.",
"set it to tp_size can get best optimized performance.",
)
parser.add_argument(
"--disable-chunked-prefix-cache",

View File

@@ -82,12 +82,12 @@ class EAGLEDraftCudaGraphRunner:
self.capture()
except RuntimeError as e:
raise Exception(
f"Capture cuda graph failed: {e}\n"
f"Capture CUDA graph failed: {e}\n"
"Possible solutions:\n"
"1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
"2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
"3. disable torch compile by not using --enable-torch-compile\n"
"4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
"4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
"Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
)
@@ -149,7 +149,7 @@ class EAGLEDraftCudaGraphRunner:
# Run and capture
def run_once():
# Backup two fileds, which will be modified in-place in `draft_forward`.
# Backup two fields, which will be modified in-place in `draft_forward`.
output_cache_loc_backup = forward_batch.out_cache_loc
hidden_states_backup = forward_batch.spec_info.hidden_states

View File

@@ -167,12 +167,12 @@ class EagleVerifyOutput:
draft_input: EagleDraftInput
# Logit outputs from target worker
logits_output: LogitsProcessorOutput
# Accepeted token ids including the bonus token
# Accepted token ids including the bonus token
verified_id: torch.Tensor
# Accepeted token length per sequence in a batch in CPU.
# Accepted token length per sequence in a batch in CPU.
accept_length_per_req_cpu: List[int]
# Accepeted indices from logits_output.next_token_logits
accepeted_indices: torch.Tensor
# Accepted indices from logits_output.next_token_logits
accepted_indices: torch.Tensor
@dataclass
@@ -316,7 +316,7 @@ class EagleVerifyInput:
This API updates values inside logits_output based on the accepted
tokens. I.e., logits_output.next_token_logits only contains
accepeted token logits.
accepted token logits.
"""
bs = self.retrive_index.shape[0]
candidates = self.draft_token.reshape(bs, self.draft_token_num)
@@ -493,7 +493,7 @@ class EagleVerifyInput:
logits_output=logits_output,
verified_id=verified_id,
accept_length_per_req_cpu=accept_length_cpu,
accepeted_indices=accept_index,
accepted_indices=accept_index,
)
else:
assign_req_to_token_pool[(bs,)](
@@ -539,7 +539,7 @@ class EagleVerifyInput:
logits_output=logits_output,
verified_id=verified_id,
accept_length_per_req_cpu=accept_length_cpu,
accepeted_indices=accept_index,
accepted_indices=accept_index,
)

View File

@@ -201,7 +201,7 @@ class EAGLEWorker(TpModelWorker):
self.has_prefill_wrapper_verify = False
else:
raise ValueError(
f"EAGLE is not supportted in attention backend {self.server_args.attention_backend}"
f"EAGLE is not supported in attention backend {self.server_args.attention_backend}"
)
self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
@@ -245,8 +245,8 @@ class EAGLEWorker(TpModelWorker):
Args:
batch: The batch to run forward. The state of the batch is modified as it runs.
Returns:
A tuple of the final logit output of the target model, next tokens accepeted,
the batch id (used for overlap schedule), and number of accepeted tokens.
A tuple of the final logit output of the target model, next tokens accepted,
the batch id (used for overlap schedule), and number of accepted tokens.
"""
if batch.forward_mode.is_decode():
with self.draft_tp_context(self.draft_model_runner.tp_group):
@@ -491,11 +491,11 @@ class EAGLEWorker(TpModelWorker):
)
# Post process based on verified outputs.
# Pick indices that we care (accepeted)
# Pick indices that we care (accepted)
logits_output.next_token_logits = logits_output.next_token_logits[
res.accepeted_indices
res.accepted_indices
]
logits_output.hidden_states = logits_output.hidden_states[res.accepeted_indices]
logits_output.hidden_states = logits_output.hidden_states[res.accepted_indices]
# Prepare the batch for the next draft forwards.
batch.forward_mode = ForwardMode.DECODE
@@ -597,7 +597,7 @@ class EAGLEWorker(TpModelWorker):
self.capture_for_decode(logits_output, forward_batch.spec_info)
def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
# Backup fileds that will be modified in-place
# Backup fields that will be modified in-place
seq_lens_backup = batch.seq_lens.clone()
req_pool_indices_backup = batch.req_pool_indices
accept_length_backup = batch.spec_info.accept_length

View File

@@ -140,7 +140,7 @@ class ChatCompletionSampler(SamplerBase):
max_tokens=self.max_tokens,
)
return response.choices[0].message.content
# NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are reruning MMMU
# NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
except openai.BadRequestError as e:
print("Bad Request Error", e)
return ""

View File

@@ -121,7 +121,7 @@ class HumanEval(Eval):
convo=convo,
metrics={
f"pass@{k}": estimate_pass_at_k([total], [correct], k)
# this will be aggrated so no need of .mean()
# this will be aggregated so no need of .mean()
for k in self._ks_passes
if total >= k
},

View File

@@ -370,7 +370,7 @@ def test_dtype_gen():
@sgl.function
def dtype_gen(s):
s += "Q: What is the full name of DNS?\n"
s += "A: The full nams is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
s += "A: The full names is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
s += "Q: Which year was DNS invented?\n"
s += "A: " + sgl.gen("int_res", dtype=int) + "\n"
s += "Q: What is the value of pi?\n"

View File

@@ -278,7 +278,7 @@ def graceful_registry(sub_module_name: str):
f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
)
if signum == signal.SIGTERM:
logger.info(f"{sub_module_name} recive sigterm")
logger.info(f"{sub_module_name} receive sigterm")
signal.signal(signal.SIGTERM, graceful_shutdown)