[CI] Fixed the spell check configuration in typos.toml (#6753)
### What this PR does / why we need it?
The regular expression `.*[UE4M3|ue4m3].*` uses a character class rather than an alternation, so it actually ignores every identifier containing any of the characters `U`, `E`, `4`, `M`, `3`, `u`, `e`, `m`, or `|`:
```yaml
extend-ignore-identifiers-re = [".*Unc.*", ".*_thw",
".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*", ".*fo.*", ".*ba.*",
".*ot.*", ".*[Tt]h[rR].*"]
```
This is corrected to use an alternation group:
```yaml
extend-ignore-identifiers-re = [".*Unc.*", ".*_thw",
".*UE8M0.*", ".*(UE4M3|ue4m3).*", ".*eles.*", ".*fo.*", ".*ba.*",
".*ot.*", ".*[Tt]h[rR].*"]
```
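For reference, a small standalone Python sketch (not part of the PR; the identifiers below are made up for illustration) showing why the character class was too broad, while the corrected group only ignores the intended `UE4M3`/`ue4m3` spellings:

```python
import re

# The buggy entry "[UE4M3|ue4m3]" is a character class: it matches any single
# character among U, E, 4, M, 3, u, e, m, or |, so nearly every identifier is ignored.
buggy = re.compile(r".*[UE4M3|ue4m3].*")

# The fixed entry "(UE4M3|ue4m3)" is an alternation group: only identifiers that
# contain the literal substring "UE4M3" or "ue4m3" are ignored.
fixed = re.compile(r".*(UE4M3|ue4m3).*")

assert buggy.fullmatch("quant_ue4m3_scale") and fixed.fullmatch("quant_ue4m3_scale")
assert buggy.fullmatch("enable_prefetch") and not fixed.fullmatch("enable_prefetch")  # false positive before the fix
assert buggy.fullmatch("mode") and not fixed.fullmatch("mode")                        # false positive before the fix
```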
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main: 9562912cea
Signed-off-by: MrZ20 <2609716663@qq.com>
```diff
@@ -144,14 +144,14 @@ class AscendConfig:
         if os.getenv("VLLM_ASCEND_ENABLE_PREFETCH_MLP", "0") == "1":
             MAX_PREFETCH_WEIGHT_SIZE: int = 18 * 1024 * 1024
             gate_up_prefetch_size = int(os.getenv("VLLM_ASCEND_MLP_GATE_UP_PREFETCH_SIZE", MAX_PREFETCH_WEIGHT_SIZE))
-            down_prefetch_szie = int(os.getenv("VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE", MAX_PREFETCH_WEIGHT_SIZE))
+            down_prefetch_size = int(os.getenv("VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE", MAX_PREFETCH_WEIGHT_SIZE))
             self.weight_prefetch_config.set_mlp_pre_version_compatibale_config(
-                gate_up_prefetch_size, down_prefetch_szie
+                gate_up_prefetch_size, down_prefetch_size
             )
             logger.info_once(
                 f"MLP weight prefetch enabled from env variable VLLM_ASCEND_ENABLE_PREFETCH_MLP."
                 f"gate_up_prefetch_size={gate_up_prefetch_size}, "
-                f"down_prefetch_szie={down_prefetch_szie}."
+                f"down_prefetch_size={down_prefetch_size}."
             )
             warnings.warn(
                 "VLLM_ASCEND_ENABLE_PREFETCH_MLP is deprecated and will be removed in a v0.16.0 version. "
```
```diff
@@ -34,13 +34,13 @@ else:
     from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass

 # computation-communication tiling block is 512
-ALLREDUCE_NORM_FUSE_THREHOLD = 512
+ALLREDUCE_NORM_FUSE_THRESHOLD = 512


 def get_compile_range_and_extra_stream_check():
     def check_func(match: Match) -> bool:
         compile_range = get_pass_context().compile_range
-        return extra_stream_scope_check(match) and compile_range.start > ALLREDUCE_NORM_FUSE_THREHOLD
+        return extra_stream_scope_check(match) and compile_range.start > ALLREDUCE_NORM_FUSE_THRESHOLD

     return check_func

@@ -176,5 +176,5 @@ class MatmulAllReduceAddRMSNormPass(VllmInductorPass):
         """
         Check if the pass is applicable for the current configuration.
         """
-        applicable = compile_range.start > ALLREDUCE_NORM_FUSE_THREHOLD
+        applicable = compile_range.start > ALLREDUCE_NORM_FUSE_THRESHOLD
         return applicable
```
```diff
@@ -86,9 +86,9 @@ class BudgetRefiner:
                 return k
         return None

-    def _get_max_budget(self, num_deocde_tokens, num_decode):
+    def _get_max_budget(self, num_decode_tokens, num_decode):
         """Get the maximum budget according to the number of decoding tokens and the decoding requests."""
-        aligned_ctx = self._align_key(num_deocde_tokens, self.context_keys)
+        aligned_ctx = self._align_key(num_decode_tokens, self.context_keys)
         aligned_dnum = self._align_key(num_decode, self.dnum_keys)
         if aligned_ctx is None or aligned_dnum is None:
             return self.default_budget
@@ -99,7 +99,7 @@ class BudgetRefiner:
         # For debug.
         # logger.info(
         #     f"budget {budget}, ctx,dnum {aligned_ctx, aligned_dnum}, "
-        #     f"raw ctx,dnum {num_deocde_tokens, num_decode}"
+        #     f"raw ctx,dnum {num_decode_tokens, num_decode}"
         # )
         return budget

@@ -114,8 +114,8 @@ class BudgetRefiner:
         num_decode = len(num_decode_token_lst)
         if num_decode <= 0:
             return budget
-        num_deocde_tokens = sum(num_decode_token_lst) / num_decode
-        return self._get_max_budget(num_deocde_tokens, num_decode)
+        num_decode_tokens = sum(num_decode_token_lst) / num_decode
+        return self._get_max_budget(num_decode_tokens, num_decode)


 class SchedulerDynamicBatch(Scheduler):
```
```diff
@@ -171,7 +171,7 @@ class HCCLLibrary:
     path_to_library_cache: dict[str, Any] = {}

     # class attribute to store the mapping from library path
-    # to the correspongding directory
+    # to the corresponding directory
     path_to_dict_mapping: dict[str, dict[str, Any]] = {}

     def __init__(self, so_file: str | None = None):
```
```diff
@@ -1316,8 +1316,8 @@ class MooncakeConnectorWorker:
         """
         prefill_tp_size = meta.remote_ptp_size if getattr(meta, "remote_ptp_size", None) else self._prefill_tp_size
         if meta.remote_pcp_size * meta.remote_dcp_size * self.pcp_size * self.dcp_size == 1:
-            choosen_rank_list = self._get_remote_rank(req_id, prefill_tp_size)
-            remote_handshake_port_list = [[x + meta.remote_port for x in choosen_rank_list]]
+            chosen_rank_list = self._get_remote_rank(req_id, prefill_tp_size)
+            remote_handshake_port_list = [[x + meta.remote_port for x in chosen_rank_list]]
             local_block_ids_list, remote_block_ids_list = [meta.local_block_ids], [meta.remote_block_ids]
             return remote_handshake_port_list, local_block_ids_list, remote_block_ids_list

@@ -1563,8 +1563,8 @@ class MooncakeConnectorWorker:
                 ),
             )
         else:  # TODO: support prefill context parallel and pipeline parallel open at the same time
-            choosen_rank_list = self._get_remote_rank(remote_req_id, prefill_tp_size)
-            remote_handshake_port_list = [[x + meta.remote_port] for x in choosen_rank_list]
+            chosen_rank_list = self._get_remote_rank(remote_req_id, prefill_tp_size)
+            remote_handshake_port_list = [[x + meta.remote_port] for x in chosen_rank_list]
             for i in range(tp_num_need_pulls * self._prefill_pp_size):
                 assert self.kv_recv_thread is not None
                 remote_host, remote_engine_id = self._get_remote_host_info_by_port(
@@ -1651,8 +1651,8 @@ class MooncakeConnectorWorker:
             or self.use_sparse
         ):
             tp_ori_data = tp_ori_data.reshape(-1, num_groups)
-            choosen_group = tp_ori_data[:, [rand_group_index]]
-            flattened = choosen_group.reshape(-1).tolist()
+            chosen_group = tp_ori_data[:, [rand_group_index]]
+            flattened = chosen_group.reshape(-1).tolist()
             tp_sampled_nums = [
                 flattened[i : i + tp_num_need_pulls] for i in range(0, len(flattened), tp_num_need_pulls)
             ]
```
```diff
@@ -741,7 +741,7 @@ class MooncakeLayerwiseConnectorScheduler:
                 computed_tokens.get(req_id, 0) + scheduled_tokens - spec_decode_tokens
             )

-        def add_tranfer_task(req_id, send_req_info: SendReqInfo, chunk_finish=False):
+        def add_transfer_task(req_id, send_req_info: SendReqInfo, chunk_finish=False):
             (
                 local_block_ids,
                 local_transed_tokens,
@@ -771,7 +771,7 @@ class MooncakeLayerwiseConnectorScheduler:
             # whether chunk finish
             chunk_finish = send_req_info.local_computed_tokens >= len(send_req_info.request.all_token_ids)

-            add_tranfer_task(req_id, send_req_info, chunk_finish=chunk_finish)
+            add_transfer_task(req_id, send_req_info, chunk_finish=chunk_finish)
             if chunk_finish:
                 self._reqs_need_send_layerwise.pop(req_id)
         return meta
```
```diff
@@ -68,7 +68,7 @@ class D2DExpertWeightLoader:
         self.updated_log2phy_map = log2phy_map

     def asyn_expert_weight_transfer(self, reqs):
-        # Only when send/recv tasks are parsed into self.comm_op_list, d2d send/recv tasks can be luanched
+        # Only when send/recv tasks are parsed into self.comm_op_list, d2d send/recv tasks can be launched
         if self.state != ExpertWeightUpdateState.READY:
             return

@@ -80,7 +80,7 @@ class D2DExpertWeightLoader:
         self.state = ExpertWeightUpdateState.TRANSFERRING

     def update_expert_map_and_weight(self, reqs):
-        # Only after send/recv tasks have been luanched, expert_map and weight can be updated
+        # Only after send/recv tasks have been launched, expert_map and weight can be updated
         if self.state != ExpertWeightUpdateState.TRANSFERRING:
             return
```
```diff
@@ -130,8 +130,8 @@ def jsq_placement(X, pieces, M, stage_weights):
             score = 0.0
             for s in range(n_stage):
                 tmp_sj = loads[s, j] + w[s]
-                numer_sj = tmp_sj if tmp_sj > stage_max[s] else stage_max[s]
-                score += stage_weights[s] * (numer_sj / denom[s])
+                number_sj = tmp_sj if tmp_sj > stage_max[s] else stage_max[s]
+                score += stage_weights[s] * (number_sj / denom[s])
             if score < best_val:
                 best_val = score
                 best_j = j
```
```diff
@@ -195,10 +195,10 @@ class NPUPlatform(Platform):
             )

         if vllm_config.additional_config.get("ascend_compilation_config", {}).get("fuse_allreduce_rms", True):
-            from vllm_ascend.compilation.passes.allreduce_rmsnorm_fusion_pass import ALLREDUCE_NORM_FUSE_THREHOLD
+            from vllm_ascend.compilation.passes.allreduce_rmsnorm_fusion_pass import ALLREDUCE_NORM_FUSE_THRESHOLD

             new_compile_ranges_split_points = vllm_config.compilation_config.compile_ranges_split_points
-            new_compile_ranges_split_points.append(ALLREDUCE_NORM_FUSE_THREHOLD)
+            new_compile_ranges_split_points.append(ALLREDUCE_NORM_FUSE_THRESHOLD)
             new_compile_ranges_split_points = sorted(new_compile_ranges_split_points)
             vllm_config.compilation_config.compile_ranges_split_points = new_compile_ranges_split_points
             logger.debug(
@@ -208,10 +208,10 @@ class NPUPlatform(Platform):

         npugraph_ex_config = ascend_config.npugraph_ex_config
         if npugraph_ex_config and npugraph_ex_config.fuse_allreduce_rms:
-            from vllm_ascend.compilation.passes.allreduce_rmsnorm_fusion_pass import ALLREDUCE_NORM_FUSE_THREHOLD
+            from vllm_ascend.compilation.passes.allreduce_rmsnorm_fusion_pass import ALLREDUCE_NORM_FUSE_THRESHOLD

             new_compile_ranges_split_points = vllm_config.compilation_config.compile_ranges_split_points
-            new_compile_ranges_split_points.append(ALLREDUCE_NORM_FUSE_THREHOLD)
+            new_compile_ranges_split_points.append(ALLREDUCE_NORM_FUSE_THRESHOLD)
             new_compile_ranges_split_points = sorted(new_compile_ranges_split_points)
             vllm_config.compilation_config.compile_ranges_split_points = new_compile_ranges_split_points
             logger.debug(
@@ -558,7 +558,7 @@ class NPUPlatform(Platform):
         Args:
             attn_metadata (dict[str, Any]): attention metadata for all layers.
             vllm_config (VllmConfig): configuration of vllm.
-            dp_metadata (DpMetada): metadata for data parallelism.
+            dp_metadata (Dpmetadata): metadata for data parallelism.
                 lack of typehint because of circular import.
             virtual_engine (int, optional): index of virtual engine. Defaults to 0.
             num_tokens (int | None, optional): number of tokens. Defaults to None.
```
```diff
@@ -941,7 +941,7 @@ class EagleProposer(VllmEagleProposer):
         # [0, 1, 2, 3, 4, 5, 6, 7, 8] ->
         # [0, 1, 0, 1, 2, 3, 0, 1, 2]
         #  _r1_  ____r2____  ___r3__
-        token_offests = self.token_arange_np[:total_num_tokens] - new_query_start_locs_expanded
+        token_offsets = self.token_arange_np[:total_num_tokens] - new_query_start_locs_expanded

         # Expand starting positions to match token pattern
         # [0, q1, q1 + q2] ->
@@ -952,7 +952,7 @@ class EagleProposer(VllmEagleProposer):
         # [0, 1, // req 1
         #  q1 + 0, q1 + 1, q1 + 2, q1 + 3, // req 2
         #  q1 + q2 + 0, q1 + q2 + 1, q1 + q2 + 2] // req 3
-        token_indices_np = token_offests + old_query_start_locs_expanded
+        token_indices_np = token_offsets + old_query_start_locs_expanded
         token_indices = torch.from_numpy(token_indices_np).to(device, non_blocking=True)

         common_attn_metadata.slot_mapping[: token_indices.shape[0]].copy_(
```
```diff
@@ -35,7 +35,7 @@ class Proposer:
         aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
         batch_descriptor=None,
     ):
-        """Called by dummy_run in modle_runner"""
+        """Called by dummy_run in model_runner"""
        raise NotImplementedError

     def generate_token_ids(
```
```diff
@@ -2390,7 +2390,7 @@ class NPUModelRunner(GPUModelRunner):
         to be reshaped to the desired shape before being used by the models.

         NOTE: To support prefill disaggregation, we need to split kvcache tensor into
-        k_cahce and v cache, and the addr of both are aligned by 2M
+        k_cache and v cache, and the addr of both are aligned by 2M

         Args:
             kv_cache_config: The KV cache config
```
```diff
@@ -459,9 +459,9 @@ class PCPManager:
         # draft_len of each request [1, 2, 1]
         # then prev_draft_token_indices is [0, 2, 3, 4]
         prev_draft_token_indices.extend(range(start, start + draft_len))
-        num_commmon_tokens = len(sample_flattened_indices)
+        num_common_tokens = len(sample_flattened_indices)

-        if num_commmon_tokens == 0:
+        if num_common_tokens == 0:
             # No requests in common with the previous iteration
             # So input_ids.cpu will have all the input ids.
             return
```