From 0dae55a9a3deebdb4f2263011154d886c525fc13 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Tue, 29 Apr 2025 11:14:19 +0800 Subject: [PATCH] [MISC] fix format check error (#654) This pr makes format.sh works as expect. Signed-off-by: wangxiyuan --- csrc/kernels/pos_encoding_kernels.cpp | 8 ++--- docs/source/faqs.md | 2 +- docs/source/installation.md | 2 +- examples/disaggregated_prefill_hccl.py | 2 +- examples/offline_inference_audio_language.py | 2 +- format.sh | 33 +++++++++---------- .../e2e/test_medusa_correctness.py | 2 +- .../spec_decode/e2e/test_mlp_correctness.py | 4 +-- .../spec_decode/e2e/test_ngram_correctness.py | 2 +- tools/actionlint.sh | 3 +- tools/mypy.sh | 6 +--- vllm_ascend/attention/attention.py | 2 +- vllm_ascend/core/schedule_config.py | 2 +- .../distributed/llmdatadist_connector.py | 11 +++++-- vllm_ascend/distributed/parallel_state.py | 4 +-- vllm_ascend/ops/fused_moe.py | 2 +- vllm_ascend/ops/rotary_embedding.py | 2 +- 17 files changed, 45 insertions(+), 44 deletions(-) diff --git a/csrc/kernels/pos_encoding_kernels.cpp b/csrc/kernels/pos_encoding_kernels.cpp index 28ef503..0b77ce8 100644 --- a/csrc/kernels/pos_encoding_kernels.cpp +++ b/csrc/kernels/pos_encoding_kernels.cpp @@ -29,7 +29,7 @@ using vllm_ascend::AccType; using vllm_ascend::local_mem_copy; template class RotaryEmbedding { // NOTE(ganyi): we use 512B as load stride for pipe, need to find another way to - // retrive this size from runtime for more Soc support + // retrieve this size from runtime for more Soc support static int constexpr loadSize = 512; using dst_t = scalar_t; using acc_t = typename AccType::type; @@ -66,7 +66,7 @@ public: pipe_->InitBuffer(inQue_, 1 /* buffer_num */, loadSize /* buffer_size */); pipe_->InitBuffer(inQueSinCos_, 1 /* buffer_num */, rotDim_ * sizeof(scalar_t) /* buffer_size */); pipe_->InitBuffer(outQue_, 1 /* buffer_num */, loadSize /* buffer_size */); - // 2 temperary calculation buffer + // 2 temporary calculation buffer calcTmpBufferOffset_ = 0; // 1 upcast buffer for bf16 (headSize) upcastInputBufferOffset_ = calcTmpBufferOffset_ + sizeof(acc_t) * embedDim_ * 2; @@ -75,10 +75,10 @@ public: // 2 sin cos upcast buffer for bf16 cosSinUpcastBufferOffset_ = upcastTempBufferOffset_ + sizeof(acc_t) * 2 * embedDim_; // 2. bf16 path: needs 2 cos sin upcast buffer size - // 3. fp16 path: needs 2 temperary calculation buffer size + // 3. fp16 path: needs 2 temporary calculation buffer size tempBufferSize_ = cosSinUpcastBufferOffset_ + 2 * embedDim_ * sizeof(acc_t); // need to consider upcast the bf16 to fp32, so we might need 4 buffer just in case - // 2 temperary buffer, 2 input buffer, 1 cos buffer, 1 sin buffer, 2 scale buffer (headSize), 2 zp + // 2 temporary buffer, 2 input buffer, 1 cos buffer, 1 sin buffer, 2 scale buffer (headSize), 2 zp // buffer(headSize int8), 1 dst_temp buffer(headSize, int32) pipe_->InitBuffer(calcBuf_, tempBufferSize_ /* buffer_size */); if constexpr (!std::is_same_v) { diff --git a/docs/source/faqs.md b/docs/source/faqs.md index 288689b..f954098 100644 --- a/docs/source/faqs.md +++ b/docs/source/faqs.md @@ -89,7 +89,7 @@ Currently, w8a8 quantization is already supported by vllm-ascend originally on v Currently, w8a8 DeepSeek is working in process: [support AscendW8A8 quantization](https://github.com/vllm-project/vllm-ascend/pull/511) -Please run DeepSeek with BF16 now, follwing the [Multi-Node DeepSeek inferencing tutorail](https://vllm-ascend.readthedocs.io/en/main/tutorials/multi_node.html) +Please run DeepSeek with BF16 now, following the [Multi-Node DeepSeek inferencing tutorail](https://vllm-ascend.readthedocs.io/en/main/tutorials/multi_node.html) ### 12. There is not output in log when loading models using vllm-ascend, How to solve it? diff --git a/docs/source/installation.md b/docs/source/installation.md index 51d0475..cb63a4b 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -127,7 +127,7 @@ apt update -y apt install -y gcc g++ cmake libnuma-dev wget git ``` -**[Optinal]** Config the extra-index of `pip` if you are working on a **x86** machine, so that the torch with cpu could be found: +**[Optional]** Config the extra-index of `pip` if you are working on a **x86** machine, so that the torch with cpu could be found: ```bash pip config set global.extra-index-url https://download.pytorch.org/whl/cpu/ diff --git a/examples/disaggregated_prefill_hccl.py b/examples/disaggregated_prefill_hccl.py index ab82abc..be317d2 100644 --- a/examples/disaggregated_prefill_hccl.py +++ b/examples/disaggregated_prefill_hccl.py @@ -84,7 +84,7 @@ def run_decode(prefill_done): gpu_memory_utilization=0.8, tensor_parallel_size=2) - # Wait for the producer to start the comsumer + # Wait for the producer to start the consumer print("Waiting for prefill node to finish...") prefill_done.wait() diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py index 4f9d64f..deb8105 100644 --- a/examples/offline_inference_audio_language.py +++ b/examples/offline_inference_audio_language.py @@ -93,7 +93,7 @@ def main(args): inputs = {"prompt": prompt, "multi_modal_data": mm_data} if args.num_prompts > 1: # Batch inference - inputs = [inputs] * args.num_prompts + inputs = [inputs] * args.num_prompts # type: ignore outputs = llm.generate(inputs, sampling_params=sampling_params) diff --git a/format.sh b/format.sh index bec7be4..595bf2f 100755 --- a/format.sh +++ b/format.sh @@ -116,6 +116,7 @@ format_all() { yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" . } +echo 'vllm-ascend yapf:' ## This flag formats individual files. --files *must* be the first command line ## arg to use this option. if [[ "$1" == '--files' ]]; then @@ -128,12 +129,12 @@ else # Format only the files that changed in last commit. format_changed fi -echo 'vLLM yapf: Done' +echo 'vllm-ascend yapf: Done' # Run mypy -echo 'vLLM mypy:' +echo 'vllm-ascend mypy:' tools/mypy.sh -echo 'vLLM mypy: Done' +echo 'vllm-ascend mypy: Done' # If git diff returns a file that is in the skip list, the file may be checked anyway: @@ -172,6 +173,7 @@ spell_check_changed() { fi } +echo 'vllm-ascend codespell:' # Run Codespell ## This flag runs spell check of individual files. --files *must* be the first command line ## arg to use this option. @@ -185,7 +187,7 @@ else # Check spelling only of the files that changed in last commit. spell_check_changed fi -echo 'vLLM codespell: Done' +echo 'vllm-ascend codespell: Done' # Lint specified files @@ -211,6 +213,7 @@ lint_changed() { } +echo 'vllm-ascend ruff:' # Run Ruff ### This flag lints individual files. --files *must* be the first command line ### arg to use this option. @@ -224,7 +227,7 @@ else # Format only the files that changed in last commit. lint_changed fi -echo 'vLLM ruff: Done' +echo 'vllm-ascend ruff: Done' # check spelling of specified files isort_check() { @@ -251,6 +254,7 @@ isort_check_changed() { fi } +echo 'vllm-ascend isort:' # Run Isort # This flag runs spell check of individual files. --files *must* be the first command line # arg to use this option. @@ -264,18 +268,13 @@ else # Check spelling only of the files that changed in last commit. isort_check_changed fi -echo 'vLLM isort: Done' +echo 'vllm-ascend isort: Done' # Clang-format section # Exclude some files for formatting because they are vendored # NOTE: Keep up to date with .github/workflows/clang-format.yml CLANG_FORMAT_EXCLUDES=( - 'csrc/moe/topk_softmax_kernels.cu' - 'csrc/quantization/gguf/ggml-common.h' - 'csrc/quantization/gguf/dequantize.cuh' - 'csrc/quantization/gguf/vecdotq.cuh' - 'csrc/quantization/gguf/mmq.cuh' - 'csrc/quantization/gguf/mmvq.cuh' + 'csrc/kernels/pos_encoding_kernels.cpp' ) # Format specified files with clang-format @@ -315,15 +314,15 @@ elif [[ "$1" == '--all' ]]; then else clang_format_changed fi -echo 'vLLM clang-format: Done' +echo 'vllm-ascend clang-format: Done' -echo 'vLLM actionlint:' +echo 'vllm-ascend actionlint:' tools/actionlint.sh -color -echo 'vLLM actionlint: Done' +echo 'vllm-ascend actionlint: Done' -echo 'vLLM shellcheck:' +echo 'vllm-ascend shellcheck:' tools/shellcheck.sh -echo 'vLLM shellcheck: Done' +echo 'vllm-ascend shellcheck: Done' echo 'excalidraw png check:' tools/png-lint.sh diff --git a/tests/singlecard/spec_decode/e2e/test_medusa_correctness.py b/tests/singlecard/spec_decode/e2e/test_medusa_correctness.py index 92d3ae4..26cd11b 100644 --- a/tests/singlecard/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/singlecard/spec_decode/e2e/test_medusa_correctness.py @@ -236,7 +236,7 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, # TODO: There is a problem with the preemptive scheduling in the current # version, which makes this case fail. Please release this case after the -# preemptive scheduling preblem is solved. +# preemptive scheduling problem is solved. # @pytest.mark.parametrize( # "common_llm_kwargs", # [{ diff --git a/tests/singlecard/spec_decode/e2e/test_mlp_correctness.py b/tests/singlecard/spec_decode/e2e/test_mlp_correctness.py index 675556f..e446c60 100644 --- a/tests/singlecard/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/singlecard/spec_decode/e2e/test_mlp_correctness.py @@ -296,7 +296,7 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, # TODO: There is a problem with the preemptive scheduling in the current # version, which makes this case fail. Please release this case after the -# preemptive scheduling preblem is solved. +# preemptive scheduling problem is solved. # @pytest.mark.parametrize( # "common_llm_kwargs", # [{ @@ -352,7 +352,7 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, # TODO: There is a problem with the preemptive scheduling in the current # version, which makes this case fail. Please release this case after the -# preemptive scheduling preblem is solved. +# preemptive scheduling problem is solved. # @pytest.mark.parametrize( # "common_llm_kwargs", # [{ diff --git a/tests/singlecard/spec_decode/e2e/test_ngram_correctness.py b/tests/singlecard/spec_decode/e2e/test_ngram_correctness.py index 14d97e9..18ed731 100644 --- a/tests/singlecard/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/singlecard/spec_decode/e2e/test_ngram_correctness.py @@ -175,7 +175,7 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, # TODO: There is a problem with the preemptive scheduling in the current # version, which makes this case fail. Please release this case after the -# preemptive scheduling preblem is solved. +# preemptive scheduling problem is solved. # @pytest.mark.parametrize( # "common_llm_kwargs", # [{ diff --git a/tools/actionlint.sh b/tools/actionlint.sh index 072335b..a050b56 100755 --- a/tools/actionlint.sh +++ b/tools/actionlint.sh @@ -18,6 +18,7 @@ # This file is a part of the vllm-ascend project. # Adapted from https://github.com/vllm-project/vllm/tree/main/tools # +export SHELLCHECK_OPTS="--exclude=SC2046,SC2006" if command -v actionlint &> /dev/null; then actionlint .github/workflows/*.yml .github/workflows/*.yaml @@ -29,4 +30,4 @@ fi # download a binary to the current directory - v1.7.3 bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash) -./actionlint .github/workflows/*.yml .github/workflows/*.yaml +./actionlint .github/workflows/*.yml .github/workflows/*.yaml diff --git a/tools/mypy.sh b/tools/mypy.sh index 7f7f644..57a3d27 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -28,11 +28,7 @@ fi run_mypy() { echo "Running mypy on $1" - if [ "$CI" -eq 1 ] && [ -z "$1" ]; then - mypy --python-version "${PYTHON_VERSION}" "$@" - return - fi - mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" + mypy --check-untyped-defs --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" } run_mypy vllm_ascend diff --git a/vllm_ascend/attention/attention.py b/vllm_ascend/attention/attention.py index 3de391f..b179785 100644 --- a/vllm_ascend/attention/attention.py +++ b/vllm_ascend/attention/attention.py @@ -1080,7 +1080,7 @@ class AscendMLAAttentionBackendImpl(MLAAttentionImpl): if len(kv_cache) > 0 and kv_cache[0].numel( ) > 0 and attn_metadata.num_prefills > 0: slots = attn_metadata.slot_mapping - # NOTE: Seperate the kv cache in advance to avoid OOM or other issues + # NOTE: Separate the kv cache in advance to avoid OOM or other issues torch_npu._npu_reshape_and_cache(key=kv_c_normed.view( num_tokens, self.num_kv_heads, -1), value=k_pe, diff --git a/vllm_ascend/core/schedule_config.py b/vllm_ascend/core/schedule_config.py index 9194411..51e4960 100644 --- a/vllm_ascend/core/schedule_config.py +++ b/vllm_ascend/core/schedule_config.py @@ -60,7 +60,7 @@ class AscendSchedulerConfig(SchedulerConfig): ) if self.is_multimodal_model: raise NotImplementedError( - "currently AscendScheduler only supports LLM modles.") + "currently AscendScheduler only supports LLM models.") if self.num_scheduler_steps > 1: raise NotImplementedError( "currently AscendScheduler doesn't support multi-step.") diff --git a/vllm_ascend/distributed/llmdatadist_connector.py b/vllm_ascend/distributed/llmdatadist_connector.py index 69c8ce7..19a759a 100644 --- a/vllm_ascend/distributed/llmdatadist_connector.py +++ b/vllm_ascend/distributed/llmdatadist_connector.py @@ -57,8 +57,10 @@ def get_device_ips(): universal_newlines=True) if npu_info.returncode != 0 or not os.path.exists(HCCN_TOOL_PATH): raise RuntimeError("No npu-smi/hccn_tool tools provided for NPU.") - npu_start_idx = int( - re.match(r'.*\n\t([0-9]+).*', npu_info.stdout).group(1)) + re_result = re.match(r'.*\n\t([0-9]+).*', npu_info.stdout) + if re_result is None: + raise RuntimeError("Can't find npu start index") + npu_start_idx = int(re_result.group(1)) device_ip_list = [] for ip_offset in range(world_size): cmd = [ @@ -68,7 +70,10 @@ def get_device_ips(): stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) - device_ip = re.match(r'ipaddr:(.*)\n', device_ip_info.stdout).group(1) + re_result = re.match(r'ipaddr:(.*)\n', device_ip_info.stdout) + if re_result is None: + raise RuntimeError("Can't find npu ip") + device_ip = re_result.group(1) device_ip_list.append(device_ip) return device_ip_list diff --git a/vllm_ascend/distributed/parallel_state.py b/vllm_ascend/distributed/parallel_state.py index d7be8c3..895b7ff 100644 --- a/vllm_ascend/distributed/parallel_state.py +++ b/vllm_ascend/distributed/parallel_state.py @@ -7,7 +7,7 @@ from vllm.distributed.parallel_state import (GroupCoordinator, get_world_group, # vllm-ascend will maintain its own EP GroupCoordinator and ETP GroupCoordinator for # customize parallel solution _EP: Optional[GroupCoordinator] = None -_ETP: Optional[list[GroupCoordinator]] = None +_ETP: Optional[GroupCoordinator] = None def get_ep_group() -> GroupCoordinator: @@ -69,4 +69,4 @@ def destory_ascend_model_parallel(): global _ETP if _ETP: _ETP.destroy() - _ETP = None \ No newline at end of file + _ETP = None diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 7eebc7d..2c25e0c 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -278,7 +278,7 @@ def fused_experts( dtype=dtype) # TODO: npu_grouped_matmul output random values at [num_valid_tokens:, ...] - # This created multiple NaN and index_add_ will mix them up which harms accracy + # This created multiple NaN and index_add_ will mix them up which harms accuracy # remove this mask and filter after it being fixed num_valid_tokens = mask.sum() valid_token_mask = torch.arange( diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py index 1ddd08f..f830364 100644 --- a/vllm_ascend/ops/rotary_embedding.py +++ b/vllm_ascend/ops/rotary_embedding.py @@ -227,7 +227,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): persistent=False) -# TODO: Patch when aclnn ops avaiable +# TODO: Patch when aclnn ops available RotaryEmbedding.forward_oot = rope_forward_oot DeepseekScalingRotaryEmbedding.forward = native_rope_deepseek_forward DeepseekScalingRotaryEmbedding._set_cos_sin_cache = _set_cos_sin_cache