[Core] Cherry-pick from 0.7.1 to keep the main code up to date (#127)

Cherry-pick from 0.7.1 to keep the main code up to date

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
wangxiyuan
2025-02-21 17:07:37 +08:00
committed by GitHub
parent 36991b2052
commit 5f465010de
11 changed files with 1136 additions and 353 deletions

View File

@@ -102,7 +102,7 @@ jobs:
run: |
pip install -e .
- name: Install torch-npu
- name: Install pta
run: |
mkdir pta
cd pta

File diff suppressed because it is too large

View File

@@ -53,7 +53,7 @@ from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, flatten_2d_lists,
is_pin_memory_available, make_tensor_with_pad)
is_pin_memory_available)
from vllm.worker.model_runner_base import (
ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase,
_add_attn_metadata_broadcastable_dict,
@@ -511,50 +511,21 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]):
for data in self.inter_data_list
}
batch_size = len(input_tokens)
if self.inter_data_list[0].is_prompt:
input_tokens_tensor = make_tensor_with_pad(
input_tokens, 0, dtype=torch.int, device=self.runner.device)
input_tokens_tensor = torch.flatten(input_tokens_tensor)
if mrope_input_positions is not None:
mrope_input_positions_tensor = make_tensor_with_pad(
mrope_input_positions,
0,
dtype=torch.int,
device=self.runner.device)
input_positions_tensor = torch.tensor(
mrope_input_positions_tensor,
dtype=torch.long,
device=self.runner.device)
else:
input_positions_tensor = make_tensor_with_pad(
input_positions,
0,
dtype=torch.int,
device=self.runner.device)
input_positions_tensor = torch.flatten(input_positions_tensor)
max_seq_len = max(seq_lens)
seq_lens = len(seq_lens) * [max_seq_len]
input_tokens_tensor = torch.tensor(flatten_2d_lists(input_tokens),
dtype=torch.long,
device=self.runner.device)
if mrope_input_positions is not None:
input_positions_tensor = torch.tensor(mrope_input_positions,
dtype=torch.long,
device=self.runner.device)
else:
input_tokens_tensor = torch.tensor(flatten_2d_lists(input_tokens),
dtype=torch.long,
device=self.runner.device)
if mrope_input_positions is not None:
input_positions_tensor = torch.tensor(
mrope_input_positions,
dtype=torch.long,
device=self.runner.device)
else:
input_positions_tensor = torch.tensor(
flatten_2d_lists(input_positions),
dtype=torch.long,
device=self.runner.device)
input_positions_tensor = torch.tensor(
flatten_2d_lists(input_positions),
dtype=torch.long,
device=self.runner.device)
# Attention metadata.
attn_metadata = self.attn_metadata_builder.build(
seq_lens, query_lens, -1, batch_size)
attn_metadata = self.attn_metadata_builder.build(seq_lens, query_lens)
# Multi-modal data.
multi_modal_kwargs_list = [
@@ -749,10 +720,14 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]):
mrope_input_positions, mrope_position_delta = \
MRotaryEmbedding.get_input_positions(
token_ids,
hf_config,
image_grid_thw=image_grid_thw,
video_grid_thw=video_grid_thw,
second_per_grid_ts=None,
image_token_id=hf_config.image_token_id,
video_token_id=hf_config.video_token_id,
vision_start_token_id=hf_config.vision_start_token_id,
vision_end_token_id=hf_config.vision_end_token_id,
spatial_merge_size=hf_config.vision_config.
spatial_merge_size,
context_len=inter_data.context_lens[seq_idx],
seq_len=inter_data.seq_lens[seq_idx],
)

View File

@@ -14,5 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import vllm_ascend.ops.activation # noqa
import vllm_ascend.ops.fused_moe # noqa
import vllm_ascend.ops.layernorm # noqa
import vllm_ascend.ops.rotary_embedding # noqa
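
Note: these imports (presumably vllm_ascend/ops/__init__.py) work purely through side effects; each module assigns a forward_oot implementation onto the corresponding vLLM layer class when it is imported. A minimal sanity check, assuming the vllm-ascend package is installed:

import vllm_ascend.ops  # applies the NPU forward_oot patches added in this commit
from vllm.model_executor.layers.activation import SiluAndMul

# The patched method should originate from the module added below.
assert SiluAndMul.forward_oot.__module__ == "vllm_ascend.ops.activation"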

View File

@@ -0,0 +1,29 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import torch
from vllm.model_executor.layers.activation import SiluAndMul
def silu_and_mul_forward_oot(self, x: torch.Tensor) -> torch.Tensor:
import torch_npu
out = torch_npu.npu_swiglu(x)
return out
SiluAndMul.forward_oot = silu_and_mul_forward_oot
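
For context, vLLM dispatches CustomOp layers to forward_oot on out-of-tree platforms, so this assignment makes SiluAndMul compute its gated activation with npu_swiglu. A minimal usage sketch, assuming torch_npu and an available NPU device (shapes are illustrative):

import torch
from vllm.model_executor.layers.activation import SiluAndMul
import vllm_ascend.ops.activation  # applies the forward_oot patch above

layer = SiluAndMul()
x = torch.randn(4, 2 * 128, device="npu", dtype=torch.float16)  # last dim = 2 * hidden size
out = layer.forward_oot(x)  # silu(x[..., :128]) * x[..., 128:], computed by npu_swiglu
assert out.shape == (4, 128)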

View File

@@ -0,0 +1,176 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Callable, Optional
import torch
import torch_npu
from vllm.model_executor.layers.fused_moe.layer import \
UnquantizedFusedMoEMethod
def group_topk(hidden_states: torch.Tensor,
gating_output: torch.Tensor,
topk: int,
renormalize: bool,
num_expert_group: Optional[int] = 0,
topk_group: Optional[int] = 0,
scoring_func: str = "softmax",
e_score_correction_bias: Optional[torch.Tensor] = None):
assert hidden_states.shape[0] == gating_output.shape[0], (
"Number of tokens mismatch")
if scoring_func == "softmax":
scores = torch.softmax(gating_output, dim=-1)
elif scoring_func == "sigmoid":
scores = gating_output.sigmoid()
else:
raise ValueError(f"Unsupported scoring function: {scoring_func}")
if e_score_correction_bias is not None:
# Store original scores before applying correction bias. We use biased
# scores for expert selection but original scores for routing weights
original_scores = scores
scores = scores + e_score_correction_bias.unsqueeze(0)
torch_npu.npu_group_topk(input=scores,
out=scores,
group_num=num_expert_group,
k=topk_group)
if e_score_correction_bias is not None:
topk_ids = torch.topk(scores, k=topk, dim=-1, sorted=False)[1]
# Use original unbiased scores for the routing weights
topk_weights = original_scores.gather(1, topk_ids)
else:
topk_weights, topk_ids = torch.topk(scores,
k=topk,
dim=-1,
sorted=False)
if renormalize:
topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
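
npu_group_topk restricts routing to the best topk_group expert groups by zeroing the scores of every other group before the per-token top-k is taken; a rough CPU equivalent of that masking step (illustrative only, not the exact kernel semantics) looks like:

import torch

def group_topk_mask_reference(scores: torch.Tensor, num_expert_group: int,
                              topk_group: int) -> torch.Tensor:
    # scores: (num_tokens, num_experts); experts are grouped contiguously.
    num_tokens, num_experts = scores.shape
    grouped = scores.view(num_tokens, num_expert_group, -1)
    group_scores = grouped.max(dim=-1).values               # (num_tokens, num_expert_group)
    top_groups = group_scores.topk(topk_group, dim=-1).indices
    keep = torch.zeros_like(group_scores, dtype=torch.bool)
    keep.scatter_(1, top_groups, True)                       # mark the selected groups
    return (grouped * keep.unsqueeze(-1)).reshape(num_tokens, num_experts)
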
def fused_experts(hidden_states: torch.Tensor, w1: torch.Tensor,
w2: torch.Tensor, topk_weights: torch.Tensor,
topk_ids: torch.Tensor, top_k: int):
# Check constraints.
assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch"
assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
assert w1.is_contiguous(), "Expert weights1 must be contiguous"
assert w2.is_contiguous(), "Expert weights2 must be contiguous"
assert hidden_states.dtype in [
torch.float32, torch.float16, torch.bfloat16
]
ori_shape = hidden_states.shape
if len(ori_shape) == 3:
hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
num_tokens, _ = hidden_states.shape
E, N, _ = w1.shape
row_idx_len = num_tokens * top_k
row_idx = torch.arange(0,
row_idx_len,
dtype=torch.int32,
device=topk_weights.device).view(top_k, -1).permute(
1, 0).contiguous()
expanded_x, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing(
hidden_states,
row_idx=row_idx,
expert_idx=topk_ids,
active_num=num_tokens)
expert_tokens = torch_npu.npu_moe_compute_expert_tokens(
expanded_expert_idx, E)
expert_tokens = expert_tokens.to(torch.int64)
w1 = w1.transpose(1, 2)
gate_up_out_list = torch_npu.npu_grouped_matmul(x=[expanded_x],
weight=[w1],
split_item=2,
group_list_type=0,
group_type=0,
group_list=expert_tokens)
# TODO: Remove this in the future.
gate_up_out = torch.cat(gate_up_out_list, dim=0)
gate_up_out = torch_npu.npu_swiglu(gate_up_out)
w2 = w2.transpose(1, 2)
down_out_list = torch_npu.npu_grouped_matmul(x=[gate_up_out],
weight=[w2],
split_item=2,
group_list_type=0,
group_type=0,
group_list=expert_tokens)
down_out_list = torch.cat(down_out_list, dim=0)
# TODO: Device memory is reordered twice here; replace the current
# implementation when suitable operators become available.
routing_weights = topk_weights.to(down_out_list.dtype)
hidden_states = torch_npu.npu_moe_finalize_routing(
down_out_list,
skip1=None,
skip2=None,
bias=None,
scales=routing_weights,
expanded_src_to_dst_row=expanded_row_idx,
export_for_source_row=topk_ids)
if len(ori_shape) == 3:
hidden_states = hidden_states.view(ori_shape)
return hidden_states
def forward_oot(
self,
layer: torch.nn.Module,
x: torch.Tensor,
use_grouped_topk: bool,
top_k: int,
router_logits: torch.Tensor,
renormalize: bool,
topk_group: Optional[int] = None,
num_expert_group: Optional[int] = None,
custom_routing_function: Optional[Callable] = None,
scoring_func: str = "softmax",
e_score_correction_bias: Optional[torch.Tensor] = None
) -> torch.Tensor:
topk_weights, topk_ids = group_topk(
hidden_states=x,
gating_output=router_logits,
topk=top_k,
renormalize=renormalize,
num_expert_group=num_expert_group,
topk_group=topk_group,
scoring_func=scoring_func,
e_score_correction_bias=e_score_correction_bias)
return fused_experts(hidden_states=x,
w1=layer.w13_weight,
w2=layer.w2_weight,
topk_weights=topk_weights,
topk_ids=topk_ids,
top_k=top_k)
UnquantizedFusedMoEMethod.forward_oot = forward_oot
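
For orientation, the weights follow vLLM's FusedMoE layout: layer.w13_weight is (num_experts, 2 * intermediate_size, hidden_size) and layer.w2_weight is (num_experts, hidden_size, intermediate_size). A call sketch with illustrative sizes, assuming an NPU device:

num_tokens, hidden, inter, num_experts, top_k = 8, 128, 256, 16, 2
x = torch.randn(num_tokens, hidden, device="npu", dtype=torch.float16)
w13 = torch.randn(num_experts, 2 * inter, hidden, device="npu", dtype=torch.float16)
w2 = torch.randn(num_experts, hidden, inter, device="npu", dtype=torch.float16)
logits = torch.randn(num_tokens, num_experts, device="npu", dtype=torch.float16)

weights, ids = group_topk(x, logits, topk=top_k, renormalize=True,
                          num_expert_group=4, topk_group=2)
out = fused_experts(x, w13, w2, weights, ids, top_k)  # (num_tokens, hidden)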

View File

@@ -0,0 +1,56 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Optional, Tuple
import torch
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
def rope_forward_oot(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor,
offsets: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
import torch_npu
if self.cos_sin_cache.device != query.device:
self.cos_sin_cache = self.cos_sin_cache.to(query.device)
if self.cos_sin_cache.dtype != query.dtype:
self.cos_sin_cache = self.cos_sin_cache.to(query.dtype)
if offsets is not None:
raise NotImplementedError(
"Batched rotary embedding is currently not supported on NPU.")
else:
# TODO: Remove the contiguous() calls in the future.
query = query.contiguous()
key = key.contiguous()
torch_npu.npu_rope(
positions,
query,
key,
self.head_size,
self.cos_sin_cache,
self.is_neox_style,
)
return query, key
RotaryEmbedding.forward_oot = rope_forward_oot
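
A minimal sketch of the patched rotary path, assuming torch_npu, an NPU device, and the usual vLLM layout where query/key are flattened to (num_tokens, num_heads * head_size); shapes are illustrative:

import torch
from vllm.model_executor.layers.rotary_embedding import get_rope
import vllm_ascend.ops.rotary_embedding  # applies the forward_oot patch above

head_size, num_heads, num_tokens = 128, 8, 4
rope = get_rope(head_size, rotary_dim=head_size, max_position=4096,
                base=10000, is_neox_style=True)
positions = torch.arange(num_tokens, device="npu")
q = torch.randn(num_tokens, num_heads * head_size, device="npu", dtype=torch.float16)
k = torch.randn(num_tokens, num_heads * head_size, device="npu", dtype=torch.float16)
q, k = rope.forward_oot(positions, q, k)  # rotated in place via npu_rope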

View File

@@ -16,7 +16,7 @@
#
import os
from typing import Optional, Tuple
from typing import TYPE_CHECKING, Optional, Tuple
import torch
@@ -28,6 +28,11 @@ except ImportError:
from vllm.config import VllmConfig
from vllm.platforms import Platform, PlatformEnum
if TYPE_CHECKING:
from vllm.utils import FlexibleArgumentParser
else:
FlexibleArgumentParser = None
os.environ["RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"] = "1"
@@ -53,6 +58,15 @@ class NPUPlatform(Platform):
ray_device_key: str = "NPU"
device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES"
supported_quantization: list[str] = ["ascend"]
@classmethod
def pre_register_and_update(cls,
parser: Optional[FlexibleArgumentParser] = None
) -> None:
from vllm_ascend.quantization.quant_config import \
AscendQuantConfig # noqa: F401
@classmethod
def get_device_capability(cls, device_id: int = 0):
return None
@@ -96,11 +110,14 @@ class NPUPlatform(Platform):
parallel_config.worker_cls = "vllm_ascend.worker.NPUWorker"
cache_config = vllm_config.cache_config
if cache_config and cache_config.block_size is None:
cache_config.block_size = 128
# TODO: Setting block_size to 128 leads to an unexpected accuracy issue in the MLA case. Set block_size back to 128 once the problem is fixed.
cache_config.block_size = 16
@classmethod
def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
kv_cache_dtype, block_size, use_v1, use_mla):
if use_mla:
return "vllm_ascend.attention.AscendMLAAttentionBackend"
return "vllm_ascend.attention.AscendAttentionBackend"
@classmethod

View File

@@ -0,0 +1,256 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from types import MappingProxyType
from typing import Any, Dict, List, Mapping, Optional
import torch
import torch_npu # noqa: F401
from vllm.distributed import get_tensor_model_parallel_rank
from vllm.logger import init_logger
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
RowParallelLinear,
UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization import \
register_quantization_config
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.model_executor.parameter import (BasevLLMParameter,
ChannelQuantScaleParameter,
ModelWeightParameter)
from .quantizer import AscendQuantizer
logger = init_logger(__name__)
@register_quantization_config("ascend")
class AscendQuantConfig(QuantizationConfig):
"""Config class for Ascend"""
def __init__(self, quant_config: Dict[str, Any]):
self.quant_description = quant_config
def __repr__(self) -> str:
return "AscendQuantConfig:\n" + super().__repr__()
@classmethod
def get_name(cls) -> str:
return "ascend"
@classmethod
def get_supported_act_dtypes(cls) -> List[torch.dtype]:
return [torch.int8, torch.float16, torch.bfloat16]
@classmethod
def get_min_capability(cls) -> int:
raise NotImplementedError(
"Ascend hardware does not support the \"get_min_capability\" feature.")
@classmethod
def get_config_filenames(cls) -> List[str]:
return []
@classmethod
def from_config(cls, config: Dict[str, Any]) -> "AscendQuantConfig":
return cls(config)
@classmethod
def override_quantization_method(cls, hf_quant_cfg,
user_quant) -> Optional[str]:
if torch.npu.is_available():
return "ascend"
return None
def get_quant_method(self, layer: torch.nn.Module,
prefix: str) -> Optional["QuantizeMethodBase"]:
from vllm.attention.layer import Attention
if isinstance(layer, LinearBase):
if self.is_layer_skipped_ascend(prefix,
self.packed_modules_mapping):
return UnquantizedLinearMethod()
return AscendLinearMethod(self)
if isinstance(layer, Attention) and \
'fa_quant_type' in self.quant_description.keys():
return AscendQKVQuantAttentionMethod(self)
return None
def is_layer_skipped_ascend(
self,
prefix: str,
fused_mapping: Mapping[str, List[str]] = MappingProxyType({})):
# adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
proj_name = prefix.split(".")[-1]
if proj_name in fused_mapping:
shard_prefixes = [
prefix.replace(proj_name, shard_proj_name)
for shard_proj_name in fused_mapping[proj_name]
]
is_skipped = None
for shard_prefix in shard_prefixes:
is_shard_skipped = self.quant_description[shard_prefix +
'.weight'] == "FLOAT"
if is_skipped is None:
is_skipped = is_shard_skipped
elif is_shard_skipped != is_skipped:
raise ValueError(
f"Detected some but not all shards of {prefix} "
"are quantized. All shards of fused layers "
"must have the same precision.")
else:
is_skipped = self.quant_description[prefix + '.weight'] == "FLOAT"
assert is_skipped is not None
return is_skipped
def get_scaled_act_names(self) -> List[str]:
return []
class AscendLinearMethod(LinearMethodBase):
"""Linear method for Ascend quantization.
Args:
quant_config: The Ascend quantization config.
"""
def __init__(self, quant_config: AscendQuantConfig) -> None:
self.quantizer = AscendQuantizer.get_quantizer(
quant_config.quant_description)
self.quant_method = self.quantizer.build_linear_method()
def create_weights(
self,
layer: torch.nn.Module,
input_size_per_partition: int,
output_partition_sizes: List[int],
input_size: int,
output_size: int,
params_dtype: torch.dtype,
**extra_weight_attrs,
) -> None:
del output_size
output_size_per_partition = sum(output_partition_sizes)
weight_loader = extra_weight_attrs.get("weight_loader")
weights = self.quant_method.create_weights(input_size_per_partition,
output_size_per_partition,
params_dtype)
weight_name = self.quant_method.get_weight()
if weight_name in weights.keys():
layer.register_parameter(
weight_name,
ModelWeightParameter(data=weights[weight_name].transpose(0, 1),
input_dim=1,
output_dim=0,
weight_loader=weight_loader))
else:
raise ValueError(
f"{weight_name} is not registered. Please check your linear quant method implementation."
)
pertensor_names = self.quant_method.get_pertensor_param()
for pertensor_name in pertensor_names:
if pertensor_name in weights.keys():
param = BasevLLMParameter(data=weights[pertensor_name],
weight_loader=weight_loader)
# disable warning
param.ignore_warning = True
layer.register_parameter(pertensor_name, param)
else:
raise ValueError(
f"{pertensor_name} is not registered. Please check your linear quant method implementation."
)
perchannel_names = self.quant_method.get_perchannel_param()
for perchannel_name in perchannel_names:
if perchannel_name in weights.keys():
layer.register_parameter(
perchannel_name,
ChannelQuantScaleParameter(data=weights[perchannel_name],
output_dim=0,
weight_loader=weight_loader))
else:
raise ValueError(
f"{perchannel_name} is not registered. Please check your linear quant method implementation."
)
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
if hasattr(self.quant_method,
'transpose_weight') and self.quant_method.transpose_weight:
layer.weight.data = layer.weight.data.transpose(1, 0)
def apply(
self,
layer: torch.nn.Module,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:
if isinstance(layer, RowParallelLinear):
tp_rank = get_tensor_model_parallel_rank()
return self.quant_method.apply(layer, x, bias, tp_rank)
return self.quant_method.apply(layer, x, bias)
class AscendQKVQuantAttentionMethod(BaseKVCacheMethod):
"""Attention method for Ascend quantization.
Args:
quant_config: The Ascend quantization config.
"""
def __init__(self, quant_config: AscendQuantConfig) -> None:
self.quantizer = AscendQuantizer.get_quantizer(
quant_config.quant_description)
self.quant_method = self.quantizer.build_attention_method()
def create_weights(self, layer: torch.nn.Module) -> None:
# Ascend attention quantization might include some extra weights,
# which must be loaded via dummy modules.
extra_module_names = self.quant_method.get_extra_module_names()
for name in extra_module_names:
setattr(layer, name, torch.nn.Module())
# During model initialization, the default dtype is set as the model
# weight and activation dtype.
dtype = torch.get_default_dtype()
weights = self.quant_method.create_weights(dtype, layer.num_heads,
layer.num_kv_heads)
for name, weight in weights.items():
module_name, weight_name = name.split('.')
module = getattr(layer, module_name)
module.register_parameter(
weight_name, torch.nn.Parameter(weight, requires_grad=False))
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
if hasattr(self.quant_method, "process_weights_after_loading"):
self.quant_method.process_weights_after_loading(layer)
def apply(self, layer: torch.nn.Module, query: torch.Tensor,
key: torch.Tensor, value: torch.Tensor,
kv_cache: List[torch.Tensor], scale: torch.Tensor,
seq_lens_tensor_cpu: int, block_tables: torch.Tensor,
isPrefill: bool, attn_metadata, output) -> torch.Tensor:
return self.quant_method.apply(layer, query, key, value, kv_cache,
scale, seq_lens_tensor_cpu,
block_tables, isPrefill, attn_metadata,
output)
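
For reference, quant_description is the flat weight-to-dtype mapping loaded from the Ascend quantization config. A hypothetical example (keys and values are illustrative; only "FLOAT" and "fa_quant_type" are taken from the code above) showing how get_quant_method reacts to it:

quant_description = {
    # A non-"FLOAT" entry means the layer is quantized -> AscendLinearMethod
    "model.layers.0.self_attn.o_proj.weight": "W8A8",   # hypothetical dtype tag
    # "FLOAT" marks a skipped layer -> UnquantizedLinearMethod
    "model.layers.0.mlp.down_proj.weight": "FLOAT",
    # Presence of this key enables AscendQKVQuantAttentionMethod on Attention layers
    "fa_quant_type": "FAQuant",                          # hypothetical value
}
config = AscendQuantConfig(quant_description)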

View File

@@ -0,0 +1,51 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import importlib
from typing import Any, Dict, List
CUSTOMIZED_QUANTIZER_TYPE: List[str] = []
class AscendQuantizer:
"""An interface to different quantization implementations for Ascend hardware."""
@classmethod
def get_quantizer(cls, quant_config: Dict[str, Any]):
# TODO: Need a param to choose quantization algorithms.
quantization_algorithm = ''
if quantization_algorithm in CUSTOMIZED_QUANTIZER_TYPE:
return
try:
module = importlib.import_module("mindie_turbo")
MindIETurboQuantizer = module.MindIETurboQuantizer
except Exception:
raise NotImplementedError(
"There is no available ascend quantizer.")
return MindIETurboQuantizer.get_quantizer(quant_config)
def build_linear_method(self):
raise NotImplementedError
def build_moe_method(self):
raise NotImplementedError
def build_attention_method(self):
raise NotImplementedError
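
A short usage sketch of the dispatch above (illustrative): since CUSTOMIZED_QUANTIZER_TYPE is empty, get_quantizer always defers to the mindie_turbo package and raises NotImplementedError when it is not installed.

quantizer = AscendQuantizer.get_quantizer({"fa_quant_type": "FAQuant"})  # hypothetical config
linear_method = quantizer.build_linear_method()  # provided by the mindie_turbo quantizer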