diff --git a/.github/workflows/labled_doctest.yaml b/.github/workflows/labled_doctest.yaml index cdfed7ea..7720c7ad 100644 --- a/.github/workflows/labled_doctest.yaml +++ b/.github/workflows/labled_doctest.yaml @@ -44,11 +44,11 @@ jobs: # Each version should be tested fail-fast: false matrix: - vllm_verison: [releases-v0.13.0, releases-v0.13.0-openeuler, main, main-openeuler] + vllm_version: [releases-v0.13.0, releases-v0.13.0-openeuler, main, main-openeuler] name: vLLM Ascend test runs-on: linux-aarch64-a2b3-1 container: - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:${{ matrix.vllm_verison }} + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:${{ matrix.vllm_version }} steps: - name: Check NPU/CANN and git info run: | diff --git a/.github/workflows/schedule_nightly_test_a2.yaml b/.github/workflows/schedule_nightly_test_a2.yaml index 6bca7a90..9a011af6 100644 --- a/.github/workflows/schedule_nightly_test_a2.yaml +++ b/.github/workflows/schedule_nightly_test_a2.yaml @@ -146,10 +146,10 @@ jobs: # Each version should be tested fail-fast: false matrix: - vllm_verison: [releases-v0.13.0, releases-v0.13.0-openeuler, main, main-openeuler] + vllm_version: [releases-v0.13.0, releases-v0.13.0-openeuler, main, main-openeuler] runs-on: linux-aarch64-a2b3-1 container: - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:${{ matrix.vllm_verison }} + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:${{ matrix.vllm_version }} steps: - name: Check NPU/CANN and git info run: | @@ -183,4 +183,4 @@ jobs: # Run real test echo "Test:" - /vllm-workspace/vllm-ascend/tests/e2e/run_doctests.sh \ No newline at end of file + /vllm-workspace/vllm-ascend/tests/e2e/run_doctests.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 723cde5b..530edd22 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,9 +1,11 @@ default_install_hook_types: - pre-commit - commit-msg + default_stages: - pre-commit # Run locally - manual # Run in CI + repos: - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.14.0 @@ -11,6 +13,7 @@ repos: - id: ruff-check args: [--output-format, github, --fix] - id: ruff-format + - repo: https://github.com/codespell-project/codespell rev: v2.4.1 hooks: @@ -22,6 +25,7 @@ repos: ] additional_dependencies: - tomli + - repo: https://github.com/crate-ci/typos rev: v1.32.0 hooks: @@ -30,6 +34,7 @@ repos: "--force-exclude", "--exclude", "csrc/**" ] + # - repo: https://github.com/pre-commit/mirrors-clang-format # rev: v20.1.3 # hooks: @@ -37,17 +42,20 @@ repos: # files: ^csrc/.*\.(cpp|hpp|cc|hh|cxx|hxx)$ # types_or: [c++] # args: [--style=google, --verbose] + - repo: https://github.com/igorshubovych/markdownlint-cli rev: v0.45.0 hooks: - id: markdownlint exclude: '.*\.inc\.md$|.*report_template\.md$|.*contributors\.md$|.*PULL_REQUEST_TEMPLATE\.md$' stages: [manual] # Only run in CI + - repo: https://github.com/rhysd/actionlint rev: v1.7.7 hooks: - id: actionlint exclude: '.*\.github/workflows/scripts/.*\.ya?ml$' + - repo: local hooks: - id: png-lint diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_multimodal_model.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_multimodal_model.po index e0fd947a..0b50de43 100644 --- a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_multimodal_model.po +++ 
b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_multimodal_model.po @@ -25,5 +25,5 @@ msgid "Adding a New Multi-Modal Model" msgstr "添加新的多模态模型" #: ../../developer_guide/modeling/adding_a_new_multimodal_model.md:3 -msgid "**_Comming soon ..._**" +msgid "**_Coming soon ..._**" msgstr "**_敬请期待 ..._**" diff --git a/docs/source/user_guide/release_notes.md b/docs/source/user_guide/release_notes.md index 8084db65..3fe322f7 100644 --- a/docs/source/user_guide/release_notes.md +++ b/docs/source/user_guide/release_notes.md @@ -636,7 +636,7 @@ This is the 3rd release candidate of v0.9.1 for vLLM Ascend. Please follow the [ - Fix incorrect req block length in ascend scheduler [#2394](https://github.com/vllm-project/vllm-ascend/pull/2394) - Fix header include issue in rope [#2398](https://github.com/vllm-project/vllm-ascend/pull/2398) - Fix mtp config bug [#2412](https://github.com/vllm-project/vllm-ascend/pull/2412) -- Fix error info and adapt `attn_metedata` refactor [#2402](https://github.com/vllm-project/vllm-ascend/pull/2402) +- Fix error info and adapt `attn_metadata` refactor [#2402](https://github.com/vllm-project/vllm-ascend/pull/2402) - Fix torchair runtime error caused by configuration mismatches and `.kv_cache_bytes` file missing [#2312](https://github.com/vllm-project/vllm-ascend/pull/2312) - Move `with_prefill` allreduce from cpu to npu [#2230](https://github.com/vllm-project/vllm-ascend/pull/2230) diff --git a/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py b/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py index b9aec1ae..4ad5caa5 100644 --- a/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py +++ b/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py @@ -178,7 +178,7 @@ class ProxyState: # No lock needed - atomic operation self.prefillers[server_idx].aborted_requests.add(request_id) - def aquire_aborted_prefiller_requests(self, server_idx: int): # Changed to synchronous + def acquire_aborted_prefiller_requests(self, server_idx: int): # Changed to synchronous """ Get the set of aborted requests and clear it. This is used to release kv cache in prefiller node. @@ -325,7 +325,7 @@ async def send_request_to_service( max_retries: int = 3, base_delay: float = 0.2, ): - proxy_state.aquire_aborted_prefiller_requests(prefiller_id) + proxy_state.acquire_aborted_prefiller_requests(prefiller_id) req_data = req_data.copy() req_data["stream"] = False req_data["max_tokens"] = 1 diff --git a/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py b/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py index d339c484..aef8095a 100644 --- a/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py +++ b/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py @@ -241,7 +241,7 @@ class ProxyState: return self.prefillers[server_idx].aborted_requests.add(request_id) - def aquire_aborted_prefiller_requests(self, server_idx: int): # Changed to synchronous + def acquire_aborted_prefiller_requests(self, server_idx: int): # Changed to synchronous """ Get the set of aborted requests and clear it. This is used to release kv cache in prefiller node. 
@@ -582,7 +582,7 @@ async def send_request_to_service( max_retries: int = 3, base_delay: float = 0.2, ): - aborted_requests = proxy_state.aquire_aborted_prefiller_requests(prefiller_id) + aborted_requests = proxy_state.acquire_aborted_prefiller_requests(prefiller_id) req_data = req_data.copy() req_data["kv_transfer_params"] = { "do_remote_decode": True, diff --git a/examples/eplb/eplb_strategy.py b/examples/eplb/eplb_strategy.py index 03147b93..ceebf6e3 100644 --- a/examples/eplb/eplb_strategy.py +++ b/examples/eplb/eplb_strategy.py @@ -59,7 +59,7 @@ def calculate_average(lst): return total / count -def layer_imblance_polt(y_list, label_names, device_num, output_path, file_name): +def layer_imbalance_plot(y_list, label_names, device_num, output_path, file_name): plt.rcParams["font.sans-serif"] = ["Arial"] plt.rcParams["axes.unicode_minus"] = False x = [i for i in range(58)] @@ -160,4 +160,4 @@ if __name__ == "__main__": save_matrix_to_json(output_path, file_name, np.array(global_deployment)) label_names = ["default deployment max load", "balanced load max load", "balanced load avg load"] new_file_name = f"{exp_name}_{num_devices}_{num_redundancy_expert}.png" - layer_imblance_polt(y_list, label_names, num_devices, output_path, new_file_name) + layer_imbalance_plot(y_list, label_names, num_devices, output_path, new_file_name) diff --git a/examples/external_online_dp/dp_load_balance_proxy_server.py b/examples/external_online_dp/dp_load_balance_proxy_server.py index 23b20a1b..3ccdf104 100644 --- a/examples/external_online_dp/dp_load_balance_proxy_server.py +++ b/examples/external_online_dp/dp_load_balance_proxy_server.py @@ -283,10 +283,10 @@ async def _select_instance(api: str, req_data: Any, request_length: int): request_id = await proxy_state.next_req_id() # Select dp server based on priority score server_idx = proxy_state.select_server(priority_score) - choosen_server = proxy_state.dp_servers[server_idx] - logger.debug(f"Choose server {choosen_server.url} to process request {request_id}") + chosen_server = proxy_state.dp_servers[server_idx] + logger.debug(f"Choose server {chosen_server.url} to process request {request_id}") return InstanceInfo( - request_id=request_id, server_idx=server_idx, priority_score=priority_score, server_state=choosen_server + request_id=request_id, server_idx=server_idx, priority_score=priority_score, server_state=chosen_server ) diff --git a/examples/external_online_dp/launch_online_dp.py b/examples/external_online_dp/launch_online_dp.py index 8834e7e9..c372169a 100644 --- a/examples/external_online_dp/launch_online_dp.py +++ b/examples/external_online_dp/launch_online_dp.py @@ -29,11 +29,11 @@ dp_rpc_port = args.dp_rpc_port vllm_start_port = args.vllm_start_port -def run_command(visiable_devices, dp_rank, vllm_engine_port): +def run_command(visible_devices, dp_rank, vllm_engine_port): command = [ "bash", "./run_dp_template.sh", - visiable_devices, + visible_devices, str(vllm_engine_port), str(dp_size), str(dp_rank), @@ -55,8 +55,8 @@ if __name__ == "__main__": for i in range(dp_size_local): dp_rank = dp_rank_start + i vllm_engine_port = vllm_start_port + i - visiable_devices = ",".join(str(x) for x in range(i * tp_size, (i + 1) * tp_size)) - process = multiprocessing.Process(target=run_command, args=(visiable_devices, dp_rank, vllm_engine_port)) + visible_devices = ",".join(str(x) for x in range(i * tp_size, (i + 1) * tp_size)) + process = multiprocessing.Process(target=run_command, args=(visible_devices, dp_rank, vllm_engine_port)) processes.append(process) 
process.start() diff --git a/setup.py b/setup.py index 8e7ffbd1..acac0958 100644 --- a/setup.py +++ b/setup.py @@ -198,7 +198,7 @@ class build_and_install_aclnn(Command): try: print("Running bash build_aclnn.sh ...") subprocess.check_call(["bash", "csrc/build_aclnn.sh", ROOT_DIR, envs.SOC_VERSION]) - print("buid_aclnn.sh executed successfully!") + print("build_aclnn.sh executed successfully!") except subprocess.CalledProcessError as e: print(f"Error running build_aclnn.sh: {e}") raise SystemExit(e.returncode) diff --git a/tests/e2e/nightly/multi_node/scripts/multi_node_config.py b/tests/e2e/nightly/multi_node/scripts/multi_node_config.py index 6431e4d6..5ef3f3c7 100644 --- a/tests/e2e/nightly/multi_node/scripts/multi_node_config.py +++ b/tests/e2e/nightly/multi_node/scripts/multi_node_config.py @@ -10,7 +10,7 @@ import yaml # isort: off from tests.e2e.nightly.multi_node.scripts.utils import ( CONFIG_BASE_PATH, DEFAULT_SERVER_PORT, get_all_ipv4, get_cluster_ips, - get_net_interface, setup_logger, get_avaliable_port) + get_net_interface, setup_logger, get_available_port) # isort: on setup_logger() logger = logging.getLogger(__name__) @@ -202,7 +202,7 @@ class MultiNodeConfig: master_ip = (self.disagg_cfg.master_ip_for_node( self.cur_index, self.nodes) if self.disagg_cfg else self.nodes[0].ip) - self.proxy_port = get_avaliable_port() + self.proxy_port = get_available_port() self.envs = DistEnvBuilder( cur_node=self.cur_node, diff --git a/tests/e2e/nightly/multi_node/scripts/utils.py b/tests/e2e/nightly/multi_node/scripts/utils.py index 01c2d692..99bc6c38 100644 --- a/tests/e2e/nightly/multi_node/scripts/utils.py +++ b/tests/e2e/nightly/multi_node/scripts/utils.py @@ -75,7 +75,7 @@ def get_cluster_ips(word_size: int = 2) -> list[str]: return [resolver(dns) for dns in get_cluster_dns_list(word_size)] -def get_avaliable_port(start_port: int = 6000, end_port: int = 7000) -> int: +def get_available_port(start_port: int = 6000, end_port: int = 7000) -> int: import socket for port in range(start_port, end_port): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: diff --git a/tests/e2e/nightly/single_node/ops/multicard_ops_a3/test_dispatch_ffn_combine.py b/tests/e2e/nightly/single_node/ops/multicard_ops_a3/test_dispatch_ffn_combine.py index 5897ef0c..bf1d5f4e 100644 --- a/tests/e2e/nightly/single_node/ops/multicard_ops_a3/test_dispatch_ffn_combine.py +++ b/tests/e2e/nightly/single_node/ops/multicard_ops_a3/test_dispatch_ffn_combine.py @@ -11,7 +11,7 @@ from vllm_ascend.utils import enable_custom_op enable_custom_op() -class TestDisptachFFNCombine: +class TestDispatchFFNCombine: def __init__(self, rank, world_size, port): self.rank = rank @@ -208,7 +208,7 @@ class TestDisptachFFNCombine: def worker(rank: int, world_size: int, port: int, q: mp.SimpleQueue): - op = TestDisptachFFNCombine(rank, world_size, port) + op = TestDispatchFFNCombine(rank, world_size, port) op.generate_hcom() out1 = op.run_tensor_list() q.put(out1) diff --git a/tests/e2e/nightly/single_node/ops/multicard_ops_a3/test_dispatch_ffn_combine_bf16.py b/tests/e2e/nightly/single_node/ops/multicard_ops_a3/test_dispatch_ffn_combine_bf16.py index 91289940..85a1be39 100644 --- a/tests/e2e/nightly/single_node/ops/multicard_ops_a3/test_dispatch_ffn_combine_bf16.py +++ b/tests/e2e/nightly/single_node/ops/multicard_ops_a3/test_dispatch_ffn_combine_bf16.py @@ -11,7 +11,7 @@ from vllm_ascend.utils import enable_custom_op enable_custom_op() -class TestDisptachFFNCombine: +class TestDispatchFFNCombine: def __init__(self, rank, world_size, 
port): self.rank = rank @@ -208,7 +208,7 @@ class TestDisptachFFNCombine: def worker(rank: int, world_size: int, port: int, q: mp.SimpleQueue): - op = TestDisptachFFNCombine(rank, world_size, port) + op = TestDispatchFFNCombine(rank, world_size, port) op.generate_hcom() out1 = op.run_tensor_list() q.put(out1) diff --git a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_penality.py b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_penality.py index 8110b116..f6d2385d 100644 --- a/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_penality.py +++ b/tests/e2e/nightly/single_node/ops/singlecard_ops/triton/test_penality.py @@ -124,10 +124,10 @@ def create_test_data( logits = torch.randn(num_reqs, vocab_size, device=device, dtype=dtype) - repetiton_penalty = torch.ones(num_reqs, device=device, dtype=torch.float32) + repetition_penalty = torch.ones(num_reqs, device=device, dtype=torch.float32) for i in range(num_reqs): if torch.rand(1) > 0.3: - repetiton_penalty[i] = torch.rand(1, device=device).item() * 0.8 + 0.6 + repetition_penalty[i] = torch.rand(1, device=device).item() * 0.8 + 0.6 frequency_penalty = torch.zeros(num_reqs, device=device, dtype=torch.float32) for i in range(num_reqs): @@ -168,7 +168,7 @@ def create_test_data( output_bin_counts[state_idx, token] = count sampling_metadata = SamplingMetadata( - repetition_penalty=repetiton_penalty, + repetition_penalty=repetition_penalty, frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, temperature=temperature, @@ -217,4 +217,3 @@ def test_apply_penalties_and_temperature( atol = 1e-02 rtol = 1e-02 assert torch.allclose(logits_triton, logits_pytorch_result, atol=atol, rtol=rtol) - diff --git a/tests/ut/ops/test_moe_mlp.py b/tests/ut/ops/test_moe_mlp.py index d8914b8f..ff0bf5f5 100644 --- a/tests/ut/ops/test_moe_mlp.py +++ b/tests/ut/ops/test_moe_mlp.py @@ -18,7 +18,7 @@ class TestCumsumGroupList(unittest.TestCase): } support_combine = [(0, 0), (1, 0), (0, 1)] - unsupport_combine = [(0, 2), (2, 1), (1, 2)] + unsupported_combine = [(0, 2), (2, 1), (1, 2)] def test_cumsum_group_list_supported_conversion(self): for src_list_type, dst_list_type in self.support_combine: @@ -38,7 +38,7 @@ class TestCumsumGroupList(unittest.TestCase): def test_cumsum_group_list_unsupported_conversion_notimplementederror( self): - for src_list_type, dst_list_type in self.unsupport_combine: + for src_list_type, dst_list_type in self.unsupported_combine: with self.subTest(src=src_list_type, dst=dst_list_type): with self.assertRaises(NotImplementedError) as excinfo: cumsum_group_list(self.glist_dict[0], src_list_type, diff --git a/typos.toml b/typos.toml index 2055450b..1452efc0 100644 --- a/typos.toml +++ b/typos.toml @@ -1,6 +1,6 @@ [files] # these files may be written in non english words -extend-exclude = [] +extend-exclude = [".pre-commit-config.yaml",] ignore-hidden = true ignore-files = true ignore-dot = true @@ -17,9 +17,9 @@ ignore-hex = true identifier-leading-digits = false locale = "en" extend-ignore-identifiers-re = [".*Unc.*", ".*_thw", - ".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*", ".*fo.*", ".*ba.*", + ".*UE8M0.*", ".*(UE4M3|ue4m3).*", ".*eles.*", ".*fo.*", ".*ba.*", ".*ot.*", ".*[Tt]h[rR].*"] -extend-ignore-words-re = ["CANN", "cann","ND","alog"] +extend-ignore-words-re = ["CANN", "cann","ND","alog","nd","BA","datas","ful","udo",] extend-ignore-re = [] [default.extend-identifiers] diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index facb2fe1..843c7da6 100644 --- 
a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -144,14 +144,14 @@ class AscendConfig: if os.getenv("VLLM_ASCEND_ENABLE_PREFETCH_MLP", "0") == "1": MAX_PREFETCH_WEIGHT_SIZE: int = 18 * 1024 * 1024 gate_up_prefetch_size = int(os.getenv("VLLM_ASCEND_MLP_GATE_UP_PREFETCH_SIZE", MAX_PREFETCH_WEIGHT_SIZE)) - down_prefetch_szie = int(os.getenv("VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE", MAX_PREFETCH_WEIGHT_SIZE)) + down_prefetch_size = int(os.getenv("VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE", MAX_PREFETCH_WEIGHT_SIZE)) self.weight_prefetch_config.set_mlp_pre_version_compatibale_config( - gate_up_prefetch_size, down_prefetch_szie + gate_up_prefetch_size, down_prefetch_size ) logger.info_once( f"MLP weight prefetch enabled from env variable VLLM_ASCEND_ENABLE_PREFETCH_MLP." f"gate_up_prefetch_size={gate_up_prefetch_size}, " - f"down_prefetch_szie={down_prefetch_szie}." + f"down_prefetch_size={down_prefetch_size}." ) warnings.warn( "VLLM_ASCEND_ENABLE_PREFETCH_MLP is deprecated and will be removed in a v0.16.0 version. " diff --git a/vllm_ascend/compilation/passes/allreduce_rmsnorm_fusion_pass.py b/vllm_ascend/compilation/passes/allreduce_rmsnorm_fusion_pass.py index b97d4d83..525543e0 100644 --- a/vllm_ascend/compilation/passes/allreduce_rmsnorm_fusion_pass.py +++ b/vllm_ascend/compilation/passes/allreduce_rmsnorm_fusion_pass.py @@ -34,13 +34,13 @@ else: from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass # computation-communication tiling block is 512 -ALLREDUCE_NORM_FUSE_THREHOLD = 512 +ALLREDUCE_NORM_FUSE_THRESHOLD = 512 def get_compile_range_and_extra_stream_check(): def check_func(match: Match) -> bool: compile_range = get_pass_context().compile_range - return extra_stream_scope_check(match) and compile_range.start > ALLREDUCE_NORM_FUSE_THREHOLD + return extra_stream_scope_check(match) and compile_range.start > ALLREDUCE_NORM_FUSE_THRESHOLD return check_func @@ -176,5 +176,5 @@ class MatmulAllReduceAddRMSNormPass(VllmInductorPass): """ Check if the pass is applicable for the current configuration. """ - applicable = compile_range.start > ALLREDUCE_NORM_FUSE_THREHOLD + applicable = compile_range.start > ALLREDUCE_NORM_FUSE_THRESHOLD return applicable diff --git a/vllm_ascend/core/scheduler_dynamic_batch.py b/vllm_ascend/core/scheduler_dynamic_batch.py index 5f0898ce..6d1aab58 100644 --- a/vllm_ascend/core/scheduler_dynamic_batch.py +++ b/vllm_ascend/core/scheduler_dynamic_batch.py @@ -86,9 +86,9 @@ class BudgetRefiner: return k return None - def _get_max_budget(self, num_deocde_tokens, num_decode): + def _get_max_budget(self, num_decode_tokens, num_decode): """Get the maximum budget according to the number of decoding tokens and the decoding requests.""" - aligned_ctx = self._align_key(num_deocde_tokens, self.context_keys) + aligned_ctx = self._align_key(num_decode_tokens, self.context_keys) aligned_dnum = self._align_key(num_decode, self.dnum_keys) if aligned_ctx is None or aligned_dnum is None: return self.default_budget @@ -99,7 +99,7 @@ class BudgetRefiner: # For debug. 
# logger.info( # f"budget {budget}, ctx,dnum {aligned_ctx, aligned_dnum}, " - # f"raw ctx,dnum {num_deocde_tokens, num_decode}" + # f"raw ctx,dnum {num_decode_tokens, num_decode}" # ) return budget @@ -114,8 +114,8 @@ class BudgetRefiner: num_decode = len(num_decode_token_lst) if num_decode <= 0: return budget - num_deocde_tokens = sum(num_decode_token_lst) / num_decode - return self._get_max_budget(num_deocde_tokens, num_decode) + num_decode_tokens = sum(num_decode_token_lst) / num_decode + return self._get_max_budget(num_decode_tokens, num_decode) class SchedulerDynamicBatch(Scheduler): diff --git a/vllm_ascend/distributed/device_communicators/pyhccl_wrapper.py b/vllm_ascend/distributed/device_communicators/pyhccl_wrapper.py index 2808891b..e18992cf 100644 --- a/vllm_ascend/distributed/device_communicators/pyhccl_wrapper.py +++ b/vllm_ascend/distributed/device_communicators/pyhccl_wrapper.py @@ -171,7 +171,7 @@ class HCCLLibrary: path_to_library_cache: dict[str, Any] = {} # class attribute to store the mapping from library path - # to the correspongding directory + # to the corresponding directory path_to_dict_mapping: dict[str, dict[str, Any]] = {} def __init__(self, so_file: str | None = None): diff --git a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py index 5fc64590..02fd1a61 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py +++ b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py @@ -1316,8 +1316,8 @@ class MooncakeConnectorWorker: """ prefill_tp_size = meta.remote_ptp_size if getattr(meta, "remote_ptp_size", None) else self._prefill_tp_size if meta.remote_pcp_size * meta.remote_dcp_size * self.pcp_size * self.dcp_size == 1: - choosen_rank_list = self._get_remote_rank(req_id, prefill_tp_size) - remote_handshake_port_list = [[x + meta.remote_port for x in choosen_rank_list]] + chosen_rank_list = self._get_remote_rank(req_id, prefill_tp_size) + remote_handshake_port_list = [[x + meta.remote_port for x in chosen_rank_list]] local_block_ids_list, remote_block_ids_list = [meta.local_block_ids], [meta.remote_block_ids] return remote_handshake_port_list, local_block_ids_list, remote_block_ids_list @@ -1563,8 +1563,8 @@ class MooncakeConnectorWorker: ), ) else: # TODO: support prefill context parallel and pipeline parallel open at the same time - choosen_rank_list = self._get_remote_rank(remote_req_id, prefill_tp_size) - remote_handshake_port_list = [[x + meta.remote_port] for x in choosen_rank_list] + chosen_rank_list = self._get_remote_rank(remote_req_id, prefill_tp_size) + remote_handshake_port_list = [[x + meta.remote_port] for x in chosen_rank_list] for i in range(tp_num_need_pulls * self._prefill_pp_size): assert self.kv_recv_thread is not None remote_host, remote_engine_id = self._get_remote_host_info_by_port( @@ -1651,8 +1651,8 @@ class MooncakeConnectorWorker: or self.use_sparse ): tp_ori_data = tp_ori_data.reshape(-1, num_groups) - choosen_group = tp_ori_data[:, [rand_group_index]] - flattened = choosen_group.reshape(-1).tolist() + chosen_group = tp_ori_data[:, [rand_group_index]] + flattened = chosen_group.reshape(-1).tolist() tp_sampled_nums = [ flattened[i : i + tp_num_need_pulls] for i in range(0, len(flattened), tp_num_need_pulls) ] diff --git a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_layerwise_connector.py b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_layerwise_connector.py index 7ea8698e..c49d1621 100644 --- 
a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_layerwise_connector.py +++ b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_layerwise_connector.py @@ -741,7 +741,7 @@ class MooncakeLayerwiseConnectorScheduler: computed_tokens.get(req_id, 0) + scheduled_tokens - spec_decode_tokens ) - def add_tranfer_task(req_id, send_req_info: SendReqInfo, chunk_finish=False): + def add_transfer_task(req_id, send_req_info: SendReqInfo, chunk_finish=False): ( local_block_ids, local_transed_tokens, @@ -771,7 +771,7 @@ class MooncakeLayerwiseConnectorScheduler: # whether chunk finish chunk_finish = send_req_info.local_computed_tokens >= len(send_req_info.request.all_token_ids) - add_tranfer_task(req_id, send_req_info, chunk_finish=chunk_finish) + add_transfer_task(req_id, send_req_info, chunk_finish=chunk_finish) if chunk_finish: self._reqs_need_send_layerwise.pop(req_id) return meta diff --git a/vllm_ascend/eplb/core/eplb_device_transfer_loader.py b/vllm_ascend/eplb/core/eplb_device_transfer_loader.py index 345a0ee1..a5dc6154 100644 --- a/vllm_ascend/eplb/core/eplb_device_transfer_loader.py +++ b/vllm_ascend/eplb/core/eplb_device_transfer_loader.py @@ -68,7 +68,7 @@ class D2DExpertWeightLoader: self.updated_log2phy_map = log2phy_map def asyn_expert_weight_transfer(self, reqs): - # Only when send/recv tasks are parsed into self.comm_op_list, d2d send/recv tasks can be luanched + # Only when send/recv tasks are parsed into self.comm_op_list, d2d send/recv tasks can be launched if self.state != ExpertWeightUpdateState.READY: return @@ -80,7 +80,7 @@ class D2DExpertWeightLoader: self.state = ExpertWeightUpdateState.TRANSFERRING def update_expert_map_and_weight(self, reqs): - # Only after send/recv tasks have been luanched, expert_map and weight can be updated + # Only after send/recv tasks have been launched, expert_map and weight can be updated if self.state != ExpertWeightUpdateState.TRANSFERRING: return diff --git a/vllm_ascend/eplb/core/policy/policy_flashlb.py b/vllm_ascend/eplb/core/policy/policy_flashlb.py index 3ceefa73..117b8ed6 100644 --- a/vllm_ascend/eplb/core/policy/policy_flashlb.py +++ b/vllm_ascend/eplb/core/policy/policy_flashlb.py @@ -130,8 +130,8 @@ def jsq_placement(X, pieces, M, stage_weights): score = 0.0 for s in range(n_stage): tmp_sj = loads[s, j] + w[s] - numer_sj = tmp_sj if tmp_sj > stage_max[s] else stage_max[s] - score += stage_weights[s] * (numer_sj / denom[s]) + numerator_sj = tmp_sj if tmp_sj > stage_max[s] else stage_max[s] + score += stage_weights[s] * (numerator_sj / denom[s]) if score < best_val: best_val = score best_j = j diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 6713b90c..9bfaa0bc 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -195,10 +195,10 @@ class NPUPlatform(Platform): ) if vllm_config.additional_config.get("ascend_compilation_config", {}).get("fuse_allreduce_rms", True): - from vllm_ascend.compilation.passes.allreduce_rmsnorm_fusion_pass import ALLREDUCE_NORM_FUSE_THREHOLD + from vllm_ascend.compilation.passes.allreduce_rmsnorm_fusion_pass import ALLREDUCE_NORM_FUSE_THRESHOLD new_compile_ranges_split_points = vllm_config.compilation_config.compile_ranges_split_points - new_compile_ranges_split_points.append(ALLREDUCE_NORM_FUSE_THREHOLD) + new_compile_ranges_split_points.append(ALLREDUCE_NORM_FUSE_THRESHOLD) new_compile_ranges_split_points = sorted(new_compile_ranges_split_points) vllm_config.compilation_config.compile_ranges_split_points = new_compile_ranges_split_points logger.debug( @@ -208,10 +208,10 @@ class
NPUPlatform(Platform): npugraph_ex_config = ascend_config.npugraph_ex_config if npugraph_ex_config and npugraph_ex_config.fuse_allreduce_rms: - from vllm_ascend.compilation.passes.allreduce_rmsnorm_fusion_pass import ALLREDUCE_NORM_FUSE_THREHOLD + from vllm_ascend.compilation.passes.allreduce_rmsnorm_fusion_pass import ALLREDUCE_NORM_FUSE_THRESHOLD new_compile_ranges_split_points = vllm_config.compilation_config.compile_ranges_split_points - new_compile_ranges_split_points.append(ALLREDUCE_NORM_FUSE_THREHOLD) + new_compile_ranges_split_points.append(ALLREDUCE_NORM_FUSE_THRESHOLD) new_compile_ranges_split_points = sorted(new_compile_ranges_split_points) vllm_config.compilation_config.compile_ranges_split_points = new_compile_ranges_split_points logger.debug( @@ -558,7 +558,7 @@ class NPUPlatform(Platform): Args: attn_metadata (dict[str, Any]): attention metadata for all layers. vllm_config (VllmConfig): configuration of vllm. - dp_metadata (DpMetada): metadata for data parallelism. + dp_metadata (DPMetadata): metadata for data parallelism. lack of typehint because of circular import. virtual_engine (int, optional): index of virtual engine. Defaults to 0. num_tokens (int | None, optional): number of tokens. Defaults to None. diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 7ec749b2..73a080b4 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -941,7 +941,7 @@ class EagleProposer(VllmEagleProposer): # [0, 1, 2, 3, 4, 5, 6, 7, 8] -> # [0, 1, 0, 1, 2, 3, 0, 1, 2] # _r1_ ____r2____ ___r3__ - token_offests = self.token_arange_np[:total_num_tokens] - new_query_start_locs_expanded + token_offsets = self.token_arange_np[:total_num_tokens] - new_query_start_locs_expanded # Expand starting positions to match token pattern # [0, q1, q1 + q2] -> # [0, 0, q1, q1, q1, q1, q1 + q2, q1 + q2, q1 + q2] # Final token indices are: # [0, 1, // req 1 # q1 + 0, q1 + 1, q1 + 2, q1 + 3, // req 2 # q1 + q2 + 0, q1 + q2 + 1, q1 + q2 + 2] // req 3 - token_indices_np = token_offests + old_query_start_locs_expanded + token_indices_np = token_offsets + old_query_start_locs_expanded token_indices = torch.from_numpy(token_indices_np).to(device, non_blocking=True) common_attn_metadata.slot_mapping[: token_indices.shape[0]].copy_( diff --git a/vllm_ascend/spec_decode/interface.py b/vllm_ascend/spec_decode/interface.py index 138f88dc..42ac576a 100644 --- a/vllm_ascend/spec_decode/interface.py +++ b/vllm_ascend/spec_decode/interface.py @@ -35,7 +35,7 @@ class Proposer: aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, batch_descriptor=None, ): - """Called by dummy_run in modle_runner""" + """Called by dummy_run in model_runner""" raise NotImplementedError def generate_token_ids( diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index c8b8a94e..b1d925a5 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2390,7 +2390,7 @@ class NPUModelRunner(GPUModelRunner): to be reshaped to the desired shape before being used by the models.
NOTE: To support prefill disaggregation, we need to split kvcache tensor into - k_cahce and v cache, and the addr of both are aligned by 2M + k_cache and v cache, and the addr of both are aligned by 2M Args: kv_cache_config: The KV cache config diff --git a/vllm_ascend/worker/pcp_utils.py b/vllm_ascend/worker/pcp_utils.py index d2e7e723..3528970b 100644 --- a/vllm_ascend/worker/pcp_utils.py +++ b/vllm_ascend/worker/pcp_utils.py @@ -459,9 +459,9 @@ class PCPManager: # draft_len of each request [1, 2, 1] # then prev_draft_token_indices is [0, 2, 3, 4] prev_draft_token_indices.extend(range(start, start + draft_len)) - num_commmon_tokens = len(sample_flattened_indices) + num_common_tokens = len(sample_flattened_indices) - if num_commmon_tokens == 0: + if num_common_tokens == 0: # No requests in common with the previous iteration # So input_ids.cpu will have all the input ids. return
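
Notes on a few of the changes above:

The renamed get_available_port helper in tests/e2e/nightly/multi_node/scripts/utils.py follows the usual probe-by-bind pattern. The diff only shows the head of the function, so the following is a minimal self-contained sketch, assuming the loop body binds the candidate port and returns the first one that succeeds:

    import socket

    def get_available_port(start_port: int = 6000, end_port: int = 7000) -> int:
        """Return the first port in [start_port, end_port) that accepts a bind."""
        for port in range(start_port, end_port):
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                try:
                    # bind() succeeds only if no other process holds the port.
                    s.bind(("127.0.0.1", port))
                    return port
                except OSError:
                    continue
        raise RuntimeError(f"no available port in [{start_port}, {end_port})")

Callers such as MultiNodeConfig can then write self.proxy_port = get_available_port() without hard-coding a port, at the cost of a small race window between probing and actual use.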
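The typos.toml change from ".*[UE4M3|ue4m3].*" to ".*(UE4M3|ue4m3).*" is a behavioral fix, not just cleanup: square brackets build a character class matching any single one of the characters U, E, 4, M, 3, |, u, e, m, while parentheses build the intended alternation over the two whole tokens. A quick illustration using Python's re module (typos itself uses Rust regexes, but the class-versus-group distinction is the same):

    import re

    class_pat = re.compile(r".*[UE4M3|ue4m3].*")  # character class: any one of U,E,4,M,3,|,u,e,m
    group_pat = re.compile(r".*(UE4M3|ue4m3).*")  # alternation: the full token UE4M3 or ue4m3

    assert class_pat.match("value")        # a lone 'u' or 'e' matches -- far too broad an ignore rule
    assert not group_pat.match("value")    # no complete UE4M3/ue4m3 token, so not ignored
    assert group_pat.match("scale_ue4m3")  # the intended identifiers still match

With the old pattern, any identifier containing one of those nine characters was exempt from spell checking, which defeats most of the checker.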
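The token_offsets arithmetic fixed in eagle_proposer.py is easiest to see with concrete numbers matching the surrounding comments. A small NumPy sketch with hypothetical per-request query lengths (the q-values here are for illustration only):

    import numpy as np

    new_query_lens = np.array([2, 4, 3])  # hypothetical token counts per request this step
    old_query_lens = np.array([5, 7, 4])  # hypothetical lengths in the previous flattened layout

    # Expand each request's new start position over its tokens: [0,0, 2,2,2,2, 6,6,6]
    new_starts = np.concatenate(([0], np.cumsum(new_query_lens)))[:-1]
    new_query_start_locs_expanded = np.repeat(new_starts, new_query_lens)

    total_num_tokens = int(new_query_lens.sum())
    token_arange = np.arange(total_num_tokens)  # [0, 1, 2, 3, 4, 5, 6, 7, 8]
    token_offsets = token_arange - new_query_start_locs_expanded
    # -> [0, 1, 0, 1, 2, 3, 0, 1, 2]: each request restarts from zero

    old_starts = np.concatenate(([0], np.cumsum(old_query_lens)))[:-1]  # [0, 5, 12]
    old_query_start_locs_expanded = np.repeat(old_starts, new_query_lens)
    token_indices = token_offsets + old_query_start_locs_expanded
    # -> [0, 1, 5, 6, 7, 8, 12, 13, 14]: the first new_len tokens of each
    #    request, addressed in the old flattened layout

The real code builds the same index from self.token_arange_np and the query start location arrays; only the variable spelling changed in this patch.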