[Lint]Style: Convert vllm-ascend/ to ruff format(Batch #10) (#6173)

### What this PR does / why we need it?

**Scope of Changes**:

| File Path |
| :--- |
| `vllm_ascend/ops/layer_shard_linear.py` |
| `vllm_ascend/ops/linear.py` |
| `vllm_ascend/ops/linear_op.py` |
| `vllm_ascend/worker/worker.py` |
| `vllm_ascend/patch/worker/patch_bert.py` |
| `vllm_ascend/patch/worker/patch_deepseek.py` |
| `vllm_ascend/patch/worker/patch_distributed.py` |
| `vllm_ascend/patch/worker/patch_module.py` |
| `vllm_ascend/patch/worker/patch_multimodal_merge.py` |
| `vllm_ascend/patch/worker/patch_qwen3_next.py` |
| `vllm_ascend/patch/worker/patch_qwen3_next_mtp.py` |
| `vllm_ascend/patch/worker/patch_rejection_sampler.py` |
| `vllm_ascend/patch/worker/patch_rope.py` |
| `vllm_ascend/patch/worker/patch_triton.py` |
| `vllm_ascend/patch/worker/patch_unquantized_gemm.py` |
| `vllm_ascend/patch/worker/patch_v2_egale.py` |
| `vllm_ascend/worker/npu_input_batch.py` |
| `vllm_ascend/worker/v2/aclgraph_utils.py` |
| `vllm_ascend/worker/v2/attn_utils.py` |
| `vllm_ascend/worker/v2/model_runner.py` |
| `vllm_ascend/worker/v2/sample/gumbel.py` |
| `vllm_ascend/worker/v2/sample/penalties.py` |
| `vllm_ascend/worker/v2/sample/sampler.py` |
| `vllm_ascend/worker/v2/spec_decode/__init__.py` |
| `vllm_ascend/worker/v2/spec_decode/eagle.py` |
| `vllm_ascend/worker/v2/states.py` |

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.14.0
- vLLM main: d68209402d

Signed-off-by: MrZ20 <2609716663@qq.com>
Signed-off-by: SILONG ZENG <2609716663@qq.com>
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2026-02-06 15:35:06 +08:00
parent 65b7f716e6
commit 19b5d44ea8
33 changed files with 938 additions and 1243 deletions
--- a/vllm_ascend/worker/v2/sample/penalties.py
+++ b/vllm_ascend/worker/v2/sample/penalties.py
@@ -68,8 +68,7 @@ def _penalties_and_temperature_kernel(
    if use_penalty:
        req_state_idx = tl.load(idx_mapping_ptr + batch_idx)
        output_bin_counts = tl.load(
-            output_bin_counts_ptr + req_state_idx * output_bin_counts_stride +
-            block,
+            output_bin_counts_ptr + req_state_idx * output_bin_counts_stride + block,
            mask=mask,
        )
        # to use vector core, if use > 0 will use scalar to slow down performance
@@ -77,11 +76,9 @@ def _penalties_and_temperature_kernel(

        # Apply repetition penalties.
        if use_rep_penalty:
-            packed_block = block_idx * BLOCK_SIZE // 32 + tl.arange(
-                0, BLOCK_SIZE // 32)
+            packed_block = block_idx * BLOCK_SIZE // 32 + tl.arange(0, BLOCK_SIZE // 32)
            packed_mask = tl.load(
-                prompt_bin_mask_ptr + req_state_idx * prompt_bin_mask_stride +
-                packed_block,
+                prompt_bin_mask_ptr + req_state_idx * prompt_bin_mask_stride + packed_block,
                mask=packed_block < tl.cdiv(vocab_size, 32),
            )
            # the compiler itself does not optimize right-shift operations, so we change the same func
@@ -97,8 +94,7 @@ def _penalties_and_temperature_kernel(
            prompt_bin_mask = prompt_bin_mask.reshape(BLOCK_SIZE)

            # If token appears in prompt or output, apply, otherwise use 1.0 for no-op.
-            scale = tl.where(prompt_bin_mask | output_bin_mask, rep_penalty,
-                             1.0)
+            scale = tl.where(prompt_bin_mask | output_bin_mask, rep_penalty, 1.0)
            # If logits are positive, divide by penalty, otherwise multiply by penalty.
            logits *= tl.where(logits > 0, 1.0 / scale, scale)