[Lint]Style: Convert vllm-ascend/ to ruff format(Batch #10) (#6173)

### What this PR does / why we need it?

**Scope of Changes**:

| File Path |
| :--- |
| `vllm_ascend/ops/layer_shard_linear.py` |
| `vllm_ascend/ops/linear.py` |
| `vllm_ascend/ops/linear_op.py` |
| `vllm_ascend/worker/worker.py` |
| `vllm_ascend/patch/worker/patch_bert.py` |
| `vllm_ascend/patch/worker/patch_deepseek.py` |
| `vllm_ascend/patch/worker/patch_distributed.py` |
| `vllm_ascend/patch/worker/patch_module.py` |
| `vllm_ascend/patch/worker/patch_multimodal_merge.py` |
| `vllm_ascend/patch/worker/patch_qwen3_next.py` |
| `vllm_ascend/patch/worker/patch_qwen3_next_mtp.py` |
| `vllm_ascend/patch/worker/patch_rejection_sampler.py` |
| `vllm_ascend/patch/worker/patch_rope.py` |
| `vllm_ascend/patch/worker/patch_triton.py` |
| `vllm_ascend/patch/worker/patch_unquantized_gemm.py` |
| `vllm_ascend/patch/worker/patch_v2_egale.py` |
| `vllm_ascend/worker/npu_input_batch.py` |
| `vllm_ascend/worker/v2/aclgraph_utils.py` |
| `vllm_ascend/worker/v2/attn_utils.py` |
| `vllm_ascend/worker/v2/model_runner.py` |
| `vllm_ascend/worker/v2/sample/gumbel.py` |
| `vllm_ascend/worker/v2/sample/penalties.py` |
| `vllm_ascend/worker/v2/sample/sampler.py` |
| `vllm_ascend/worker/v2/spec_decode/__init__.py` |
| `vllm_ascend/worker/v2/spec_decode/eagle.py` |
| `vllm_ascend/worker/v2/states.py` |

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.14.0
- vLLM main: d68209402d

Signed-off-by: MrZ20 <2609716663@qq.com>
Signed-off-by: SILONG ZENG <2609716663@qq.com>
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2026-02-06 15:35:06 +08:00
parent 65b7f716e6
commit 19b5d44ea8
33 changed files with 938 additions and 1243 deletions
--- a/vllm_ascend/patch/worker/patch_bert.py
+++ b/vllm_ascend/patch/worker/patch_bert.py
@@ -24,15 +24,12 @@ TOKEN_TYPE_MULTIPLIER = 1 << 30
 TOKEN_MASK = TOKEN_TYPE_MULTIPLIER - 1


-def _encode_token_type_ids(input_ids: torch.Tensor,
-                           token_type_ids: torch.Tensor) -> None:
+def _encode_token_type_ids(input_ids: torch.Tensor, token_type_ids: torch.Tensor) -> None:
    # input_ids can be padded to the right
-    input_ids[:token_type_ids.shape[0]].bitwise_or_(token_type_ids *
-                                                    TOKEN_TYPE_MULTIPLIER)
+    input_ids[: token_type_ids.shape[0]].bitwise_or_(token_type_ids * TOKEN_TYPE_MULTIPLIER)


 def _decode_token_type_ids(input_ids: torch.Tensor) -> torch.Tensor:
-
    token_type_ids = input_ids // TOKEN_TYPE_MULTIPLIER

    input_ids.bitwise_and_(TOKEN_MASK)
--- a/vllm_ascend/patch/worker/patch_deepseek.py
+++ b/vllm_ascend/patch/worker/patch_deepseek.py
@@ -0,0 +1,54 @@
+from itertools import islice
+
+import torch
+from vllm.distributed import get_pp_group
+from vllm.model_executor.models.deepseek_v2 import DeepseekV2Model, _get_llama_4_scaling
+from vllm.sequence import IntermediateTensors
+
+
+def forward(
+    self,
+    input_ids,
+    positions,
+    intermediate_tensors,
+    inputs_embeds,
+):
+    if get_pp_group().is_first_rank:
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.embed_input_ids(input_ids)
+        residual = None
+    else:
+        assert intermediate_tensors is not None
+        hidden_states = intermediate_tensors["hidden_states"]
+        residual = intermediate_tensors["residual"]
+
+    # Compute llama 4 scaling once per forward pass if enabled
+    # Note(wxy): This is a hack fix to avoid graph mode error for torch 2.8
+    # We'll find a better way to remove this patch.
+    try:
+        llama_4_scaling_config = self.config.llama_4_scaling
+    except AttributeError:
+        llama_4_scaling_config = None
+    llama_4_scaling: torch.Tensor | None
+    if llama_4_scaling_config is not None:
+        llama_4_scaling = _get_llama_4_scaling(
+            original_max_position_embeddings=llama_4_scaling_config["original_max_position_embeddings"],
+            scaling_beta=llama_4_scaling_config["beta"],
+            positions=positions,
+        )
+    else:
+        llama_4_scaling = None
+
+    for layer in islice(self.layers, self.start_layer, self.end_layer):
+        hidden_states, residual = layer(positions, hidden_states, residual, llama_4_scaling)
+
+    if not get_pp_group().is_last_rank:
+        return IntermediateTensors({"hidden_states": hidden_states, "residual": residual})
+
+    hidden_states, _ = self.norm(hidden_states, residual)
+    return hidden_states
+
+
+DeepseekV2Model.forward = forward
--- a/vllm_ascend/patch/worker/patch_distributed.py
+++ b/vllm_ascend/patch/worker/patch_distributed.py
@@ -15,29 +15,25 @@
 # limitations under the License.
 #

-from typing import List, Optional, Union

 import torch
 import vllm
 from torch.distributed import Backend
-from vllm.distributed.parallel_state import (GroupCoordinator,
-                                             _get_unique_name, _register_group)
+from vllm.distributed.parallel_state import GroupCoordinator, _get_unique_name, _register_group

-from vllm_ascend.distributed.device_communicators.npu_communicator import \
-    NPUCommunicator
+from vllm_ascend.distributed.device_communicators.npu_communicator import NPUCommunicator
 from vllm_ascend.utils import create_hccl_pg_options


 class GroupCoordinatorPatch(GroupCoordinator):
-
    def __init__(
        self,
        group_ranks: list[list[int]],
        local_rank: int,
-        torch_distributed_backend: Union[str, Backend],
+        torch_distributed_backend: str | Backend,
        use_device_communicator: bool,  # whether to use device communicator
        use_message_queue_broadcaster: bool = False,
-        group_name: Optional[str] = None,
+        group_name: str | None = None,
    ):
        group_name = group_name or "anonymous"
        self.unique_name = _get_unique_name(group_name)
@@ -52,9 +48,8 @@ class GroupCoordinatorPatch(GroupCoordinator):

        for ranks in group_ranks:
            device_group = torch.distributed.new_group(
-                ranks,
-                backend=torch_distributed_backend,
-                pg_options=hccl_pg_options)
+                ranks, backend=torch_distributed_backend, pg_options=hccl_pg_options
+            )

            # a group with `gloo` backend, to allow direct coordination between
            # processes through the CPU.
@@ -83,22 +78,23 @@ class GroupCoordinatorPatch(GroupCoordinator):
                unique_name=self.unique_name,
            )

-        from vllm.distributed.device_communicators.shm_broadcast import \
-            MessageQueue
-        self.mq_broadcaster: Optional[MessageQueue] = None
+        from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
+
+        self.mq_broadcaster: MessageQueue | None = None
        if use_message_queue_broadcaster and self.world_size > 1:
-            self.mq_broadcaster = MessageQueue.create_from_process_group(
-                self.cpu_group, 1 << 22, 6)
+            self.mq_broadcaster = MessageQueue.create_from_process_group(self.cpu_group, 1 << 22, 6)

        self.use_custom_op_call = False
        self.use_cpu_custom_send_recv = False

-    def all_to_all(self,
-                   input_: torch.Tensor,
-                   scatter_dim: int = 0,
-                   gather_dim: int = -1,
-                   scatter_sizes: Optional[List[int]] = None,
-                   gather_sizes: Optional[List[int]] = None) -> torch.Tensor:
+    def all_to_all(
+        self,
+        input_: torch.Tensor,
+        scatter_dim: int = 0,
+        gather_dim: int = -1,
+        scatter_sizes: list[int] | None = None,
+        gather_sizes: list[int] | None = None,
+    ) -> torch.Tensor:
        if self.world_size == 1:
            return input_
        assert -input_.dim() <= scatter_dim < input_.dim(), (
@@ -108,9 +104,7 @@ class GroupCoordinatorPatch(GroupCoordinator):
            f"Invalid gather dim ({gather_dim}) for input tensor with shape {input_.size()}"
        )
        assert self.device_communicator is not None, "device_communicator should be initialized when world_size > 1"
-        return self.device_communicator.all_to_all(input_, scatter_dim,
-                                                   gather_dim, scatter_sizes,
-                                                   gather_sizes)
+        return self.device_communicator.all_to_all(input_, scatter_dim, gather_dim, scatter_sizes, gather_sizes)

    def all_reduce(self, input_):
        if self.world_size == 1:
--- a/vllm_ascend/patch/worker/patch_huanyuan_vl.py
+++ b/vllm_ascend/patch/worker/patch_huanyuan_vl.py
@@ -13,15 +13,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#from collections.abc import Iterable
+# from collections.abc import Iterable

 from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor

 _original_call = HunYuanVLProcessor.__call__

+
 def _patched_call(self, images=None, text=None, videos=None, **kwargs):
    """Remove add_special_tokens requirement."""
    kwargs.pop("add_special_tokens", None)
    return _original_call(self, images=images, text=text, videos=videos, **kwargs)

-HunYuanVLProcessor.__call__ = _patched_call
+
+HunYuanVLProcessor.__call__ = _patched_call
--- a/vllm_ascend/patch/worker/patch_module.py
+++ b/vllm_ascend/patch/worker/patch_module.py
@@ -13,7 +13,6 @@ def _argsort(tensor, *args, **kwargs):


 class _TorchWrapper:
-
    def __init__(self):
        self._raw_torch = torch

@@ -32,5 +31,6 @@ def patch_torch_npu_argsort():
    global _is_patched
    if not _is_patched:
        import vllm.v1.attention.backends.gdn_attn as gdn_attn
+
        gdn_attn.torch = _TorchWrapper()
        _is_patched = True
--- a/vllm_ascend/patch/worker/patch_multimodal_merge.py
+++ b/vllm_ascend/patch/worker/patch_multimodal_merge.py
@@ -18,8 +18,7 @@

 import torch
 import vllm
-from vllm.model_executor.models.utils import (_embedding_count_expression,
-                                              _flatten_embeddings)
+from vllm.model_executor.models.utils import _embedding_count_expression, _flatten_embeddings
 from vllm.multimodal import NestedTensors


--- a/vllm_ascend/patch/worker/patch_qwen3_next.py
+++ b/vllm_ascend/patch/worker/patch_qwen3_next.py
@@ -13,33 +13,27 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#from collections.abc import Iterable
+# from collections.abc import Iterable

 import torch
 from einops import rearrange
 from torch import nn
 from vllm.config import CUDAGraphMode
 from vllm.forward_context import get_forward_context
-from vllm.model_executor.layers.fla.ops import (
-    chunk_gated_delta_rule, fused_recurrent_gated_delta_rule)
+from vllm.model_executor.layers.fla.ops import chunk_gated_delta_rule, fused_recurrent_gated_delta_rule
 from vllm.model_executor.layers.mamba.abstract import MambaBase
-from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
-    causal_conv1d_fn, causal_conv1d_update)
-from vllm.model_executor.models.qwen3_next import (Qwen3NextGatedDeltaNet,
-                                                   fused_gdn_gating)
+from vllm.model_executor.layers.mamba.ops.causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+from vllm.model_executor.models.qwen3_next import Qwen3NextGatedDeltaNet
 from vllm.triton_utils import triton
 from vllm.v1.attention.backend import AttentionMetadata  # type: ignore
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata

-from vllm_ascend.ops.triton.fla.fused_qkvzba_split_reshape import \
-    fused_qkvzba_split_reshape_cat
-from vllm_ascend.ops.triton.fla.sigmoid_gating import \
-    fused_sigmoid_gating_delta_rule_update
+from vllm_ascend.ops.triton.fla.fused_qkvzba_split_reshape import fused_qkvzba_split_reshape_cat
+from vllm_ascend.ops.triton.fla.sigmoid_gating import fused_sigmoid_gating_delta_rule_update
 from vllm_ascend.ops.triton.fused_gdn_gating import fused_gdn_gating_patch


 class AscendQwen3Next_GatedDeltaNet(nn.Module, MambaBase):
-
    def forward(
        self,
        hidden_states: torch.Tensor,
@@ -61,10 +55,8 @@ class AscendQwen3Next_GatedDeltaNet(nn.Module, MambaBase):
        forward_context = get_forward_context()
        is_cuda_graph = forward_context.cudagraph_runtime_mode != CUDAGraphMode.NONE
        # triton grid should be less than 65536
-        divide_grid = projected_states_qkvz.shape[0] * triton.cdiv(
-            self.num_k_heads, self.tp_size)
-        if self.num_v_heads // self.num_k_heads in [1, 2, 4] and \
-            is_cuda_graph and divide_grid < 65536:
+        divide_grid = projected_states_qkvz.shape[0] * triton.cdiv(self.num_k_heads, self.tp_size)
+        if self.num_v_heads // self.num_k_heads in [1, 2, 4] and is_cuda_graph and divide_grid < 65536:
            mixed_qkv, z, b, a = fused_qkvzba_split_reshape_cat(
                projected_states_qkvz,
                projected_states_ba,
@@ -74,10 +66,8 @@ class AscendQwen3Next_GatedDeltaNet(nn.Module, MambaBase):
                self.head_v_dim,
            )
        else:
-            query, key, value, z, b, a = self.fix_query_key_value_ordering(
-                projected_states_qkvz, projected_states_ba)
-            query, key, value = map(lambda x: rearrange(x, 'l p d -> l (p d)'),
-                                    (query, key, value))
+            query, key, value, z, b, a = self.fix_query_key_value_ordering(projected_states_qkvz, projected_states_ba)
+            query, key, value = map(lambda x: rearrange(x, "l p d -> l (p d)"), (query, key, value))
            mixed_qkv = torch.cat((query, key, value), dim=-1)

        # ============================================================
@@ -150,16 +140,14 @@ class AscendQwen3Next_GatedDeltaNet(nn.Module, MambaBase):
        a = a[:num_actual_tokens]

        # 1. Convolution sequence transformation
-        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0),
-                                               self.conv1d.weight.size(2))
+        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2))
        if spec_sequence_masks is not None:
            if attn_metadata.num_prefills == 0 and attn_metadata.num_decodes == 0:
                mixed_qkv_spec = mixed_qkv
                mixed_qkv_non_spec = None
            else:
                mixed_qkv_spec = mixed_qkv.index_select(0, spec_token_indx)
-                mixed_qkv_non_spec = mixed_qkv.index_select(
-                    0, non_spec_token_indx)
+                mixed_qkv_non_spec = mixed_qkv.index_select(0, non_spec_token_indx)
        else:
            mixed_qkv_spec = None
            mixed_qkv_non_spec = mixed_qkv
@@ -172,8 +160,7 @@ class AscendQwen3Next_GatedDeltaNet(nn.Module, MambaBase):
                conv_weights,
                self.conv1d.bias,
                self.activation,
-                conv_state_indices=spec_state_indices_tensor[:, 0]
-                [:attn_metadata.num_spec_decodes],
+                conv_state_indices=spec_state_indices_tensor[:, 0][: attn_metadata.num_spec_decodes],
                num_accepted_tokens=num_accepted_tokens,
                query_start_loc=spec_query_start_loc,
                max_query_len=spec_state_indices_tensor.size(-1),
@@ -204,21 +191,16 @@ class AscendQwen3Next_GatedDeltaNet(nn.Module, MambaBase):
                conv_weights,
                self.conv1d.bias,
                self.activation,
-                conv_state_indices=
-                non_spec_state_indices_tensor[:attn_metadata.
-                                              num_actual_tokens],
+                conv_state_indices=non_spec_state_indices_tensor[: attn_metadata.num_actual_tokens],
                validate_data=True,
            )
        else:
            mixed_qkv_non_spec = None
-        query_spec, key_spec, value_spec = self.rearrange_mixed_qkv(
-            mixed_qkv_spec)
-        query_non_spec, key_non_spec, value_non_spec = self.rearrange_mixed_qkv(
-            mixed_qkv_non_spec)
+        query_spec, key_spec, value_spec = self.rearrange_mixed_qkv(mixed_qkv_spec)
+        query_non_spec, key_non_spec, value_non_spec = self.rearrange_mixed_qkv(mixed_qkv_non_spec)

        if attn_metadata.num_prefills > 0 or spec_sequence_masks is not None:
-            g, beta = fused_gdn_gating_patch(self.A_log, a, b,
-                                                self.dt_bias)
+            g, beta = fused_gdn_gating_patch(self.A_log, a, b, self.dt_bias)
            if spec_sequence_masks is not None:
                if attn_metadata.num_prefills == 0 and attn_metadata.num_decodes == 0:
                    g_spec = g
@@ -248,8 +230,7 @@ class AscendQwen3Next_GatedDeltaNet(nn.Module, MambaBase):
                    beta=beta_spec,
                    initial_state=ssm_state,
                    inplace_final_state=True,
-                    cu_seqlens=spec_query_start_loc[:attn_metadata.
-                                                    num_spec_decodes + 1],
+                    cu_seqlens=spec_query_start_loc[: attn_metadata.num_spec_decodes + 1],
                    ssm_state_indices=spec_state_indices_tensor,
                    num_accepted_tokens=num_accepted_tokens,
                    use_qk_l2norm_in_kernel=True,
@@ -259,8 +240,7 @@ class AscendQwen3Next_GatedDeltaNet(nn.Module, MambaBase):

            # 2.2: Process the remaining part
            if attn_metadata.num_prefills > 0:
-                initial_state = ssm_state[
-                    non_spec_state_indices_tensor].contiguous()
+                initial_state = ssm_state[non_spec_state_indices_tensor].contiguous()
                initial_state[~has_initial_state, ...] = 0
                (
                    core_attn_out_non_spec,
@@ -278,24 +258,20 @@ class AscendQwen3Next_GatedDeltaNet(nn.Module, MambaBase):
                    use_qk_l2norm_in_kernel=True,
                )
                # Init cache
-                ssm_state[
-                    non_spec_state_indices_tensor] = last_recurrent_state.to(
-                        ssm_state.dtype)
+                ssm_state[non_spec_state_indices_tensor] = last_recurrent_state.to(ssm_state.dtype)
            elif attn_metadata.num_decodes > 0:
-                core_attn_out_non_spec, last_recurrent_state = (
-                    fused_recurrent_gated_delta_rule(
-                        q=query_non_spec,
-                        k=key_non_spec,
-                        v=value_non_spec,
-                        g=g_non_spec,
-                        beta=beta_non_spec,
-                        initial_state=ssm_state,
-                        inplace_final_state=True,
-                        cu_seqlens=non_spec_query_start_loc[:attn_metadata.
-                                                            num_decodes + 1],
-                        ssm_state_indices=non_spec_state_indices_tensor,
-                        use_qk_l2norm_in_kernel=True,
-                    ))
+                core_attn_out_non_spec, last_recurrent_state = fused_recurrent_gated_delta_rule(
+                    q=query_non_spec,
+                    k=key_non_spec,
+                    v=value_non_spec,
+                    g=g_non_spec,
+                    beta=beta_non_spec,
+                    initial_state=ssm_state,
+                    inplace_final_state=True,
+                    cu_seqlens=non_spec_query_start_loc[: attn_metadata.num_decodes + 1],
+                    ssm_state_indices=non_spec_state_indices_tensor,
+                    use_qk_l2norm_in_kernel=True,
+                )
            else:
                core_attn_out_non_spec, last_recurrent_state = None, None

@@ -324,14 +300,12 @@ class AscendQwen3Next_GatedDeltaNet(nn.Module, MambaBase):
                device=core_attn_out_non_spec.device,
            )
            merged_out.index_copy_(1, spec_token_indx, core_attn_out_spec)
-            merged_out.index_copy_(1, non_spec_token_indx,
-                                   core_attn_out_non_spec)
+            merged_out.index_copy_(1, non_spec_token_indx, core_attn_out_non_spec)
            core_attn_out[:num_actual_tokens] = merged_out.squeeze(0)
        elif spec_sequence_masks is not None:
            core_attn_out[:num_actual_tokens] = core_attn_out_spec.squeeze(0)
        else:
-            core_attn_out[:num_actual_tokens] = core_attn_out_non_spec.squeeze(
-                0)
+            core_attn_out[:num_actual_tokens] = core_attn_out_non_spec.squeeze(0)


 Qwen3NextGatedDeltaNet.forward = AscendQwen3Next_GatedDeltaNet.forward
--- a/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py
+++ b/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py
@@ -1,13 +1,15 @@
 import torch
 import vllm.v1.worker.utils as utils
 from vllm.v1.worker.utils import defaultdict, extract_layer_index
+
 from vllm_ascend.utils import vllm_version_is

 if vllm_version_is("v0.15.0"):
-    from vllm.attention.layer import Attention # type: ignore
+    from vllm.attention.layer import Attention  # type: ignore
 else:
    from vllm.model_executor.layers.attention import Attention

+
 # Without this patch, an exception is raised when initializing kv_cache.
 # TODO: To remove the patch, we need to check why the original bind_kv_cache raises a NotImplementedError.
 def bind_kv_cache(
@@ -38,8 +40,7 @@ def bind_kv_cache(
    # Convert kv_caches dict to a list of tensors in the order of layer_index.
    index2name = defaultdict(list)
    for layer_name in kv_caches:
-        index2name[extract_layer_index(layer_name,
-                                       num_attn_module)].append(layer_name)
+        index2name[extract_layer_index(layer_name, num_attn_module)].append(layer_name)

    for layer_index in sorted(index2name.keys()):
        layer_names = index2name[layer_index]
--- a/vllm_ascend/patch/worker/patch_rejection_sampler.py
+++ b/vllm_ascend/patch/worker/patch_rejection_sampler.py
@@ -1,8 +1,6 @@
 import vllm.v1.sample.rejection_sampler as rs

-from vllm_ascend.sample.rejection_sampler import (apply_sampling_constraints,
-                                                  expand_batch_to_tokens,
-                                                  rejection_sample)
+from vllm_ascend.sample.rejection_sampler import apply_sampling_constraints, expand_batch_to_tokens, rejection_sample

 # TODO: delete this patch after apply_sampling_constraints and rejection_sample
 #   are extracted to as class func of RejectionSampler
--- a/vllm_ascend/patch/worker/patch_rope.py
+++ b/vllm_ascend/patch/worker/patch_rope.py
@@ -17,12 +17,10 @@

 import torch
 import torch.nn as nn
-from vllm.model_executor.layers.rotary_embedding.base import \
-    RotaryEmbeddingBase
+from vllm.model_executor.layers.rotary_embedding.base import RotaryEmbeddingBase


 class AscendRotaryEmbeddingBase(nn.Module):
-
    def get_cos_sin(self, seqlen: int) -> tuple[torch.Tensor, torch.Tensor]:
        cos_sin = self.cos_sin_cache[:seqlen]
        cos, sin = cos_sin.chunk(2, dim=-1)
--- a/vllm_ascend/patch/worker/patch_triton.py
+++ b/vllm_ascend/patch/worker/patch_triton.py
@@ -3,16 +3,15 @@ import vllm.v1.worker.gpu.sample.gumbel

 from vllm_ascend.ops.triton.fla.chunk import chunk_gated_delta_rule
 from vllm_ascend.ops.triton.fla.layernorm_guard import LayerNormFn
-from vllm_ascend.ops.triton.fla.sigmoid_gating import \
-    fused_recurrent_gated_delta_rule_fwd_kernel
-from vllm_ascend.ops.triton.mamba.causal_conv1d import (
-    causal_conv1d_fn, causal_conv1d_update_npu)
-from vllm_ascend.worker.v2.sample.gumbel import \
-    gumbel_sample as ascend_gumbel_sample
+from vllm_ascend.ops.triton.fla.sigmoid_gating import fused_recurrent_gated_delta_rule_fwd_kernel
+from vllm_ascend.ops.triton.mamba.causal_conv1d import causal_conv1d_fn, causal_conv1d_update_npu
+from vllm_ascend.worker.v2.sample.gumbel import gumbel_sample as ascend_gumbel_sample

 vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_update = causal_conv1d_update_npu
 vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_fn = causal_conv1d_fn
-vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_rule_fwd_kernel = fused_recurrent_gated_delta_rule_fwd_kernel
+vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_rule_fwd_kernel = (
+    fused_recurrent_gated_delta_rule_fwd_kernel
+)
 vllm.model_executor.layers.fla.ops.layernorm_guard.LayerNormFn = LayerNormFn
 vllm.model_executor.layers.fla.ops.chunk_gated_delta_rule = chunk_gated_delta_rule
 vllm.v1.worker.gpu.sample.gumbel.gumbel_sample = ascend_gumbel_sample
--- a/vllm_ascend/patch/worker/patch_unquantized_gemm.py
+++ b/vllm_ascend/patch/worker/patch_unquantized_gemm.py
@@ -36,11 +36,14 @@ def unquantized_gemm_fake(
    return torch.empty(output_shape, dtype=x.dtype, device=x.device)


-direct_register_custom_op(op_name="unquantized_gemm",
-                          op_func=unquantized_gemm,
-                          fake_impl=unquantized_gemm_fake,
-                          mutates_args=[],
-                          dispatch_key="PrivateUse1")
+direct_register_custom_op(
+    op_name="unquantized_gemm",
+    op_func=unquantized_gemm,
+    fake_impl=unquantized_gemm_fake,
+    mutates_args=[],
+    dispatch_key="PrivateUse1",
+)
+

 def default_unquantized_gemm(
    layer: torch.nn.Module,
--- a/vllm_ascend/patch/worker/patch_v2_egale.py
+++ b/vllm_ascend/patch/worker/patch_v2_egale.py
@@ -19,11 +19,10 @@
 import numpy as np
 import torch
 import vllm
+from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu.input_batch import InputBatch
 from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
-from vllm.v1.sample.metadata import SamplingMetadata
-from vllm.v1.worker.gpu.spec_decode.eagle import (prepare_eagle_decode,
-                                                  prepare_eagle_inputs)
+from vllm.v1.worker.gpu.spec_decode.eagle import prepare_eagle_decode, prepare_eagle_inputs

 from vllm_ascend.worker.v2.attn_utils import build_attn_metadata

@@ -54,8 +53,7 @@ def propose(
    # seq_lens) of the target model.
    if aux_hidden_states:
        assert self.method == "eagle3"
-        hidden_states = self.model.combine_hidden_states(
-            torch.cat(aux_hidden_states, dim=-1))
+        hidden_states = self.model.combine_hidden_states(torch.cat(aux_hidden_states, dim=-1))
    else:
        hidden_states = last_hidden_states
    num_tokens = input_batch.num_tokens_after_padding
@@ -95,19 +93,12 @@ def propose(
    seeds = self.seeds[:num_reqs].clone()
    pos = self.input_buffers.positions[:num_reqs].clone()
    # Gather the values and copy them to the pre-allocated buffers.
-    torch.gather(sampling_metadata.temperature,
-                 0,
-                 cu_num_logits,
-                 out=temperature)
+    torch.gather(sampling_metadata.temperature, 0, cu_num_logits, out=temperature)
    torch.gather(sampling_metadata.seeds, 0, cu_num_logits, out=seeds)
    torch.gather(input_batch.positions, 0, last_token_indices, out=pos)
    # NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise
    # used for draft and target sampling.
-    draft_tokens = gumbel_sample(logits,
-                                 temperature,
-                                 seeds,
-                                 pos + 1,
-                                 apply_temperature=True)
+    draft_tokens = gumbel_sample(logits, temperature, seeds, pos + 1, apply_temperature=True)
    if self.num_speculative_steps == 1:
        # Early exit.
        return draft_tokens.view(-1, 1)
@@ -127,9 +118,8 @@ def propose(
        self.max_num_reqs,
    )
    query_start_loc = self.input_buffers.query_start_loc
-    query_start_loc_gpu = query_start_loc.gpu[:num_reqs + 1]
-    slot_mappings = self.block_tables.compute_slot_mappings(
-        query_start_loc_gpu, pos)
+    query_start_loc_gpu = query_start_loc.gpu[: num_reqs + 1]
+    slot_mappings = self.block_tables.compute_slot_mappings(query_start_loc_gpu, pos)

    cudagraph_size = self.cudagraph_manager.get_cudagraph_size(num_reqs)
    if cudagraph_size is not None:
@@ -138,8 +128,8 @@ def propose(
        return self.draft_tokens[:num_reqs]

    # Run eager mode.
-    query_start_loc.np[:num_reqs + 1] = np.arange(num_reqs + 1)
-    query_start_loc_cpu = query_start_loc.cpu[:num_reqs + 1]
+    query_start_loc.np[: num_reqs + 1] = np.arange(num_reqs + 1)
+    query_start_loc_cpu = query_start_loc.cpu[: num_reqs + 1]
    # HACK(woosuk)
    seq_lens_np = np.full(num_reqs, self.max_model_len, dtype=np.int32)
    block_tables = [x[:num_reqs] for x in self.block_tables.input_block_tables]
@@ -158,8 +148,7 @@ def propose(
        slot_mappings=slot_mappings,
        kv_cache_config=self.kv_cache_config,
    )
-    self.generate_draft(num_reqs, attn_metadata,
-                        num_tokens_across_dp=None)  # FIXME
+    self.generate_draft(num_reqs, attn_metadata, num_tokens_across_dp=None)  # FIXME
    return self.draft_tokens[:num_reqs]