[Core] Cherry pick from 0.7.1 to keep the main code newest (#127)

Cherry pick from 0.7.1 to keep the main code newest Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-02-21 17:07:37 +08:00
parent 36991b2052
commit 5f465010de
11 changed files with 1136 additions and 353 deletions
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -102,7 +102,7 @@ jobs:
        run: |
          pip install -e .
-      - name: Install torch-npu
+      - name: Install pta
        run: |
          mkdir pta
          cd pta
--- a/vllm_ascend/attention.py
+++ b/vllm_ascend/attention.py
--- a/vllm_ascend/model_runner.py
+++ b/vllm_ascend/model_runner.py
@@ -53,7 +53,7 @@ from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
 from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, flatten_2d_lists,
-                        is_pin_memory_available, make_tensor_with_pad)
+                        is_pin_memory_available)
 from vllm.worker.model_runner_base import (
    ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase,
    _add_attn_metadata_broadcastable_dict,
@@ -511,50 +511,21 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]):
            for data in self.inter_data_list
        }
-        batch_size = len(input_tokens)
+        input_tokens_tensor = torch.tensor(flatten_2d_lists(input_tokens),
-
+                                           dtype=torch.long,
-        if self.inter_data_list[0].is_prompt:
+                                           device=self.runner.device)
-            input_tokens_tensor = make_tensor_with_pad(
+        if mrope_input_positions is not None:
-                input_tokens, 0, dtype=torch.int, device=self.runner.device)
+            input_positions_tensor = torch.tensor(mrope_input_positions,
-            input_tokens_tensor = torch.flatten(input_tokens_tensor)
+                                                  dtype=torch.long,
-            if mrope_input_positions is not None:
+                                                  device=self.runner.device)
                mrope_input_positions_tensor = make_tensor_with_pad(
                    mrope_input_positions,
                    0,
                    dtype=torch.int,
                    device=self.runner.device)
                input_positions_tensor = torch.tensor(
                    mrope_input_positions_tensor,
                    dtype=torch.long,
                    device=self.runner.device)
            else:
                input_positions_tensor = make_tensor_with_pad(
                    input_positions,
                    0,
                    dtype=torch.int,
                    device=self.runner.device)
                input_positions_tensor = torch.flatten(input_positions_tensor)
            max_seq_len = max(seq_lens)
            seq_lens = len(seq_lens) * [max_seq_len]
        else:
-            input_tokens_tensor = torch.tensor(flatten_2d_lists(input_tokens),
+            input_positions_tensor = torch.tensor(
-                                               dtype=torch.long,
+                flatten_2d_lists(input_positions),
-                                               device=self.runner.device)
+                dtype=torch.long,
-            if mrope_input_positions is not None:
+                device=self.runner.device)
                input_positions_tensor = torch.tensor(
                    mrope_input_positions,
                    dtype=torch.long,
                    device=self.runner.device)
            else:
                input_positions_tensor = torch.tensor(
                    flatten_2d_lists(input_positions),
                    dtype=torch.long,
                    device=self.runner.device)
        # Attention metadata.
-        attn_metadata = self.attn_metadata_builder.build(
+        attn_metadata = self.attn_metadata_builder.build(seq_lens, query_lens)
            seq_lens, query_lens, -1, batch_size)
        # Multi-modal data.
        multi_modal_kwargs_list = [
@@ -749,10 +720,14 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]):
                mrope_input_positions, mrope_position_delta = \
                    MRotaryEmbedding.get_input_positions(
                        token_ids,
                        hf_config,
                        image_grid_thw=image_grid_thw,
                        video_grid_thw=video_grid_thw,
-                        second_per_grid_ts=None,
+                        image_token_id=hf_config.image_token_id,
                        video_token_id=hf_config.video_token_id,
                        vision_start_token_id=hf_config.vision_start_token_id,
                        vision_end_token_id=hf_config.vision_end_token_id,
                        spatial_merge_size=hf_config.vision_config.
                        spatial_merge_size,
                        context_len=inter_data.context_lens[seq_idx],
                        seq_len=inter_data.seq_lens[seq_idx],
                    )
--- a/vllm_ascend/ops/init.py
+++ b/vllm_ascend/ops/init.py
@@ -14,5 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
+import vllm_ascend.ops.activation  # noqa
 import vllm_ascend.ops.fused_moe  # noqa
 import vllm_ascend.ops.layernorm  # noqa
 import vllm_ascend.ops.rotary_embedding  # noqa
--- a/vllm_ascend/ops/activation.py
+++ b/vllm_ascend/ops/activation.py
@@ -0,0 +1,29 @@
 #
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 # This file is a part of the vllm-ascend project.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 import torch
 from vllm.model_executor.layers.activation import SiluAndMul
 def silu_and_mul_forward_oot(self, x: torch.Tensor) -> torch.Tensor:
    """NPU ``forward_oot`` for ``SiluAndMul``: delegate to ``npu_swiglu``.

    Args:
        x: Input tensor, passed unchanged to ``torch_npu.npu_swiglu``.

    Returns:
        The tensor returned by ``torch_npu.npu_swiglu(x)``.
    """
    # torch_npu is imported at call time rather than at module import.
    import torch_npu

    return torch_npu.npu_swiglu(x)
 # Install the function above as SiluAndMul.forward_oot.
 SiluAndMul.forward_oot = silu_and_mul_forward_oot
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -0,0 +1,176 @@
 #
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 # This file is a part of the vllm-ascend project.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 from typing import Callable, Optional
 import torch
 import torch_npu
 from vllm.model_executor.layers.fused_moe.layer import \
    UnquantizedFusedMoEMethod
 def group_topk(hidden_states: torch.Tensor,
               gating_output: torch.Tensor,
               topk: int,
               renormalize: bool,
               num_expert_group: Optional[int] = 0,
               topk_group: Optional[int] = 0,
               scoring_func: str = "softmax",
               e_score_correction_bias: Optional[torch.Tensor] = None):
    """Compute per-token top-k routing weights and expert ids.

    Args:
        hidden_states: Only its first dimension is read, to check that it
            matches the first dimension of ``gating_output``.
        gating_output: Router logits; scored along the last dimension.
        topk: Number of experts kept per token.
        renormalize: If true, divide the selected weights by their sum
            along the last dimension.
        num_expert_group: Forwarded to ``npu_group_topk`` as ``group_num``.
        topk_group: Forwarded to ``npu_group_topk`` as ``k``.
        scoring_func: ``"softmax"`` or ``"sigmoid"``.
        e_score_correction_bias: Optional bias added to the scores used
            for expert selection; the returned weights are then gathered
            from the unbiased scores.

    Returns:
        ``(topk_weights, topk_ids)`` cast to float32 and int32.

    Raises:
        AssertionError: If the token counts of the two inputs differ.
        ValueError: If ``scoring_func`` is not supported.
    """
    assert hidden_states.shape[0] == gating_output.shape[0], (
        "Number of tokens mismatch")

    if scoring_func not in ("softmax", "sigmoid"):
        raise ValueError(f"Unsupported scoring function: {scoring_func}")
    if scoring_func == "softmax":
        scores = torch.softmax(gating_output, dim=-1)
    else:
        scores = gating_output.sigmoid()

    use_bias = e_score_correction_bias is not None
    # Keep a handle on the unbiased scores: the biased scores pick the
    # experts, the unbiased ones provide the routing weights.
    unbiased_scores = scores
    if use_bias:
        scores = scores + e_score_correction_bias.unsqueeze(0)

    # The kernel writes its result back into ``scores`` (out=scores).
    torch_npu.npu_group_topk(input=scores,
                             out=scores,
                             group_num=num_expert_group,
                             k=topk_group)

    if use_bias:
        _, topk_ids = torch.topk(scores, k=topk, dim=-1, sorted=False)
        topk_weights = unbiased_scores.gather(1, topk_ids)
    else:
        topk_weights, topk_ids = torch.topk(scores,
                                            k=topk,
                                            dim=-1,
                                            sorted=False)

    if renormalize:
        weight_sum = topk_weights.sum(dim=-1, keepdim=True)
        topk_weights = topk_weights / weight_sum

    return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
 def fused_experts(hidden_states: torch.Tensor, w1: torch.Tensor,
                   w2: torch.Tensor, topk_weights: torch.Tensor,
                   topk_ids: torch.Tensor, top_k: int):
    """Run the routed expert computation with torch_npu kernels.

    The body calls, in order: ``npu_moe_init_routing``,
    ``npu_moe_compute_expert_tokens``, a grouped matmul with ``w1``,
    ``npu_swiglu``, a grouped matmul with ``w2`` and
    ``npu_moe_finalize_routing``. The exact semantics of these ops are
    defined by torch_npu and are not restated here.

    Args:
        hidden_states: Contiguous float32/float16/bfloat16 tensor. A 3-D
            input is flattened to 2-D and the result is viewed back to
            the original shape.
        w1: Contiguous 3-D expert weights; its first dimension is used as
            the number of experts.
        w2: Contiguous expert weights for the second grouped matmul.
        topk_weights: Routing weights; must have the same shape as
            ``topk_ids``.
        topk_ids: Selected expert indices.
        top_k: Number of experts per token; used to size the row index.

    Returns:
        The output of ``npu_moe_finalize_routing``, viewed back to the
        original shape when the input was 3-D.

    Raises:
        AssertionError: If any of the constraints checked below fails.
    """
    # Check constraints.
    # NOTE(review): this compares shape[1] before the 3-D flatten below, so
    # for a 3-D input it checks the second dimension, not the last one --
    # confirm this is intended.
    assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch"
    assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
    assert hidden_states.dtype in [
        torch.float32, torch.float16, torch.bfloat16
    ]
    # Remember the incoming shape so a 3-D input can be restored on return.
    ori_shape = hidden_states.shape
    if len(ori_shape) == 3:
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
    num_tokens, _ = hidden_states.shape
    E, N, _ = w1.shape
    # Build indices 0..num_tokens*top_k-1 laid out as (top_k, num_tokens),
    # then transpose to a contiguous (num_tokens, top_k) tensor.
    row_idx_len = num_tokens * top_k
    row_idx = torch.arange(0,
                           row_idx_len,
                           dtype=torch.int32,
                           device=topk_weights.device).view(top_k, -1).permute(
                               1, 0).contiguous()
    expanded_x, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing(
        hidden_states,
        row_idx=row_idx,
        expert_idx=topk_ids,
        active_num=num_tokens)
    # The result is cast to int64 and used as group_list for both grouped
    # matmuls below.
    expert_tokens = torch_npu.npu_moe_compute_expert_tokens(
        expanded_expert_idx, E)
    expert_tokens = expert_tokens.to(torch.int64)
    # Swap the last two dimensions of w1 before the first grouped matmul.
    w1 = w1.transpose(1, 2)
    gate_up_out_list = torch_npu.npu_grouped_matmul(x=[expanded_x],
                                                    weight=[w1],
                                                    split_item=2,
                                                    group_list_type=0,
                                                    group_type=0,
                                                    group_list=expert_tokens)
    # TODO: Remove this in the future.
    gate_up_out = torch.cat(gate_up_out_list, dim=0)
    gate_up_out = torch_npu.npu_swiglu(gate_up_out)
    # Same transpose for w2 before the second grouped matmul.
    w2 = w2.transpose(1, 2)
    down_out_list = torch_npu.npu_grouped_matmul(x=[gate_up_out],
                                                 weight=[w2],
                                                 split_item=2,
                                                 group_list_type=0,
                                                 group_type=0,
                                                 group_list=expert_tokens)
    # From here on ``down_out_list`` is a single concatenated tensor, not a
    # list.
    down_out_list = torch.cat(down_out_list, dim=0)
    # TODO: Reorder device memory 2 times here, replace the current
    # implementation here when suitable operators become available.
    # Routing weights are cast to the matmul output dtype before being
    # passed as ``scales``.
    routing_weights = topk_weights.to(down_out_list.dtype)
    hidden_states = torch_npu.npu_moe_finalize_routing(
        down_out_list,
        skip1=None,
        skip2=None,
        bias=None,
        scales=routing_weights,
        expanded_src_to_dst_row=expanded_row_idx,
        export_for_source_row=topk_ids)
    if len(ori_shape) == 3:
        hidden_states = hidden_states.view(ori_shape)
    return hidden_states
 def forward_oot(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        use_grouped_topk: bool,
        top_k: int,
        router_logits: torch.Tensor,
        renormalize: bool,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None
 ) -> torch.Tensor:
    """NPU ``forward_oot`` for ``UnquantizedFusedMoEMethod``.

    Selects experts with ``group_topk`` and then runs ``fused_experts``
    with ``layer.w13_weight`` and ``layer.w2_weight``.

    Note: ``use_grouped_topk`` and ``custom_routing_function`` are part of
    the signature but are not read by this implementation.

    Returns:
        The tensor returned by ``fused_experts``.
    """
    routing_kwargs = dict(hidden_states=x,
                          gating_output=router_logits,
                          topk=top_k,
                          renormalize=renormalize,
                          num_expert_group=num_expert_group,
                          topk_group=topk_group,
                          scoring_func=scoring_func,
                          e_score_correction_bias=e_score_correction_bias)
    routing_weights, expert_ids = group_topk(**routing_kwargs)

    expert_output = fused_experts(hidden_states=x,
                                  w1=layer.w13_weight,
                                  w2=layer.w2_weight,
                                  topk_weights=routing_weights,
                                  topk_ids=expert_ids,
                                  top_k=top_k)
    return expert_output
 # Install the function above as UnquantizedFusedMoEMethod.forward_oot.
 UnquantizedFusedMoEMethod.forward_oot = forward_oot
--- a/vllm_ascend/ops/rotary_embedding.py
+++ b/vllm_ascend/ops/rotary_embedding.py
@@ -0,0 +1,56 @@
 #
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 # This file is a part of the vllm-ascend project.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 from typing import Optional, Tuple
 import torch
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 def rope_forward_oot(
    self,
    positions: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    offsets: Optional[torch.Tensor] = None,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
    """NPU ``forward_oot`` for ``RotaryEmbedding`` using ``npu_rope``.

    ``self.cos_sin_cache`` is first moved to the device and dtype of
    ``query`` when they differ; this happens before ``offsets`` is checked.

    Args:
        positions: Passed to ``torch_npu.npu_rope`` unchanged.
        query: Query tensor; made contiguous before the kernel call.
        key: Key tensor; made contiguous before the kernel call.
        offsets: Must be ``None``; batched rotary embedding is rejected.

    Returns:
        The contiguous ``(query, key)`` tensors that were handed to
        ``npu_rope``. The kernel's return value is discarded, so it
        presumably updates them in place.

    Raises:
        NotImplementedError: If ``offsets`` is not ``None``.
    """
    import torch_npu

    target_device, target_dtype = query.device, query.dtype
    if self.cos_sin_cache.device != target_device:
        self.cos_sin_cache = self.cos_sin_cache.to(target_device)
    if self.cos_sin_cache.dtype != target_dtype:
        self.cos_sin_cache = self.cos_sin_cache.to(target_dtype)

    if offsets is not None:
        raise NotImplementedError(
            "Batched rotary embedding is currently not supported on NPU.")

    # TODO: Remove the contiguous in the future.
    query = query.contiguous()
    key = key.contiguous()
    torch_npu.npu_rope(
        positions,
        query,
        key,
        self.head_size,
        self.cos_sin_cache,
        self.is_neox_style,
    )
    return query, key
 # Install the function above as RotaryEmbedding.forward_oot.
 RotaryEmbedding.forward_oot = rope_forward_oot
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -16,7 +16,7 @@
 #
 import os
-from typing import Optional, Tuple
+from typing import TYPE_CHECKING, Optional, Tuple
 import torch
@@ -28,6 +28,11 @@ except ImportError:
 from vllm.config import VllmConfig
 from vllm.platforms import Platform, PlatformEnum
 if TYPE_CHECKING:
    from vllm.utils import FlexibleArgumentParser
 else:
    FlexibleArgumentParser = None
 os.environ["RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"] = "1"
@@ -53,6 +58,15 @@ class NPUPlatform(Platform):
    ray_device_key: str = "NPU"
    device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES"
    supported_quantization: list[str] = ["ascend"]
    @classmethod
    def pre_register_and_update(cls,
                                parser: Optional[FlexibleArgumentParser] = None
                                ) -> None:
        """Import the Ascend quantization config module; nothing else.

        The imported name is unused (hence ``noqa: F401``), so the import is
        done only for its side effect -- presumably the
        ``register_quantization_config("ascend")`` decorator applied to
        ``AscendQuantConfig``. ``parser`` is not read.
        """
        from vllm_ascend.quantization.quant_config import \
            AscendQuantConfig  # noqa: F401
    @classmethod
    def get_device_capability(cls, device_id: int = 0):
        """Return ``None`` unconditionally; ``device_id`` is ignored."""
        return None
@@ -96,11 +110,14 @@ class NPUPlatform(Platform):
            parallel_config.worker_cls = "vllm_ascend.worker.NPUWorker"
        cache_config = vllm_config.cache_config
        if cache_config and cache_config.block_size is None:
-            cache_config.block_size = 128
+            # TODO: Setting block_size to 128 leads to unexpected accuracy issues in the MLA case. Set block_size back to 128 once the problem is fixed.
            cache_config.block_size = 16
    @classmethod
    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
                             kv_cache_dtype, block_size, use_v1, use_mla):
        if use_mla:
            return "vllm_ascend.attention.AscendMLAAttentionBackend"
        return "vllm_ascend.attention.AscendAttentionBackend"
    @classmethod
--- a/vllm_ascend/quantization/init.py
+++ b/vllm_ascend/quantization/init.py
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -0,0 +1,256 @@
 #
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 # This file is a part of the vllm-ascend project.
 # Copyright 2023 The vLLM team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 from types import MappingProxyType
 from typing import Any, Dict, List, Mapping, Optional
 import torch
 import torch_npu  # noqa: F401
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                               RowParallelLinear,
                                               UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization import \
    register_quantization_config
 from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.parameter import (BasevLLMParameter,
                                           ChannelQuantScaleParameter,
                                           ModelWeightParameter)
 from .quantizer import AscendQuantizer
 logger = init_logger(__name__)
@register_quantization_config("ascend")
class AscendQuantConfig(QuantizationConfig):
    """Quantization config registered under the name ``"ascend"``.

    Holds the raw quantization description dict and uses it to pick the
    quantize method for each layer.
    """

    def __init__(self, quant_config: Dict[str, Any]):
        # Raw description: looked up by "<prefix>.weight" keys and checked
        # for the 'fa_quant_type' key further below.
        self.quant_description = quant_config

    def __repr__(self) -> str:
        base_repr = super().__repr__()
        return "AscendQuantConfig:\n" + base_repr

    @classmethod
    def get_name(cls) -> str:
        """Return the registered name of this config."""
        return "ascend"

    @classmethod
    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
        """Return the activation dtypes this config declares as supported."""
        return [torch.int8, torch.float16, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        """Not available on Ascend; always raises ``NotImplementedError``."""
        raise NotImplementedError(
            "Ascend hardware dose not support \"get_min_capability\" feature.")

    @classmethod
    def get_config_filenames(cls) -> List[str]:
        """Return an empty list of config file names."""
        return []

    @classmethod
    def from_config(cls, config: Dict[str, Any]) -> "AscendQuantConfig":
        """Build a config directly from the given description dict."""
        return cls(config)

    @classmethod
    def override_quantization_method(cls, hf_quant_cfg,
                                     user_quant) -> Optional[str]:
        """Return ``"ascend"`` when an NPU is available, else ``None``."""
        return "ascend" if torch.npu.is_available() else None

    def get_quant_method(self, layer: torch.nn.Module,
                         prefix: str) -> Optional["QuantizeMethodBase"]:
        """Pick the quantize method for ``layer``, or ``None`` if none applies.

        Linear layers get ``UnquantizedLinearMethod`` when marked as skipped
        in the description and ``AscendLinearMethod`` otherwise. Attention
        layers get ``AscendQKVQuantAttentionMethod`` only when the
        description contains ``'fa_quant_type'``.
        """
        from vllm.attention.layer import Attention

        if isinstance(layer, LinearBase):
            skipped = self.is_layer_skipped_ascend(
                prefix, self.packed_modules_mapping)
            if skipped:
                return UnquantizedLinearMethod()
            return AscendLinearMethod(self)

        wants_attn_quant = (isinstance(layer, Attention)
                            and 'fa_quant_type' in self.quant_description)
        if wants_attn_quant:
            return AscendQKVQuantAttentionMethod(self)
        return None

    def is_layer_skipped_ascend(
        self,
        prefix: str,
        fused_mapping: Mapping[str, List[str]] = MappingProxyType({})):
        """Tell whether the layer at ``prefix`` is marked ``"FLOAT"``.

        For a fused projection (last component of ``prefix`` found in
        ``fused_mapping``) every shard is checked and all shards must agree.

        Raises:
            ValueError: If the shards of a fused layer disagree.
            KeyError: If a ``"<prefix>.weight"`` entry is missing from the
                description.
        """
        # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
        proj_name = prefix.split(".")[-1]
        if proj_name not in fused_mapping:
            is_skipped = self.quant_description[prefix + '.weight'] == "FLOAT"
        else:
            shard_prefixes = [
                prefix.replace(proj_name, shard_name)
                for shard_name in fused_mapping[proj_name]
            ]
            is_skipped = None
            for shard_prefix in shard_prefixes:
                shard_key = shard_prefix + '.weight'
                shard_skipped = self.quant_description[shard_key] == "FLOAT"
                if is_skipped is None:
                    # First shard sets the expected value.
                    is_skipped = shard_skipped
                    continue
                if shard_skipped != is_skipped:
                    raise ValueError(
                        f"Detected some but not all shards of {prefix} "
                        "are quantized. All shards of fused layers "
                        "to have the same precision.")

        assert is_skipped is not None
        return is_skipped

    def get_scaled_act_names(self) -> List[str]:
        """Return an empty list of scaled activation names."""
        return []
 class AscendLinearMethod(LinearMethodBase):
    """Linear-layer method that delegates to an Ascend quantizer.

    The quantizer is obtained from ``AscendQuantizer.get_quantizer`` and its
    ``build_linear_method()`` result does the actual work.

    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig) -> None:
        description = quant_config.quant_description
        self.quantizer = AscendQuantizer.get_quantizer(description)
        self.quant_method = self.quantizer.build_linear_method()

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: List[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ) -> None:
        """Create the quantized weights and register them on ``layer``.

        The quant method supplies the tensors together with the names of
        the main weight, the per-tensor parameters and the per-channel
        parameters; each named tensor is wrapped in the matching vLLM
        parameter class and registered.

        Raises:
            ValueError: If a name reported by the quant method is missing
                from the tensors it created.
        """
        del output_size
        weight_loader = extra_weight_attrs.get("weight_loader")
        weights = self.quant_method.create_weights(
            input_size_per_partition, sum(output_partition_sizes),
            params_dtype)

        # Main weight: registered transposed (dims 0 and 1 swapped).
        weight_name = self.quant_method.get_weight()
        if weight_name not in weights.keys():
            raise ValueError(
                f"{weight_name} is nor registered. Please check your linear quant method implementation."
            )
        main_weight = ModelWeightParameter(
            data=weights[weight_name].transpose(0, 1),
            input_dim=1,
            output_dim=0,
            weight_loader=weight_loader)
        layer.register_parameter(weight_name, main_weight)

        # Per-tensor parameters.
        for pertensor_name in self.quant_method.get_pertensor_param():
            if pertensor_name not in weights.keys():
                raise ValueError(
                    f"{pertensor_name} is nor registered. Please check your linear quant method implementation."
                )
            param = BasevLLMParameter(data=weights[pertensor_name],
                                      weight_loader=weight_loader)
            # disable warning
            param.ignore_warning = True
            layer.register_parameter(pertensor_name, param)

        # Per-channel parameters.
        for perchannel_name in self.quant_method.get_perchannel_param():
            if perchannel_name not in weights.keys():
                raise ValueError(
                    f"{perchannel_name} is nor registered. Please check your linear quant method implementation."
                )
            channel_param = ChannelQuantScaleParameter(
                data=weights[perchannel_name],
                output_dim=0,
                weight_loader=weight_loader)
            layer.register_parameter(perchannel_name, channel_param)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Transpose ``layer.weight`` when the quant method asks for it.

        Nothing happens unless the quant method has a truthy
        ``transpose_weight`` attribute.
        """
        if getattr(self.quant_method, 'transpose_weight', False):
            layer.weight.data = layer.weight.data.transpose(1, 0)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Run the quantized linear op via the quant method.

        For ``RowParallelLinear`` layers the tensor-parallel rank is passed
        as an extra positional argument.
        """
        if not isinstance(layer, RowParallelLinear):
            return self.quant_method.apply(layer, x, bias)
        tp_rank = get_tensor_model_parallel_rank()
        return self.quant_method.apply(layer, x, bias, tp_rank)
 class AscendQKVQuantAttentionMethod(BaseKVCacheMethod):
    """Attention quantization method that delegates to an Ascend quantizer.

    The quantizer is obtained from ``AscendQuantizer.get_quantizer`` and its
    ``build_attention_method()`` result does the actual work.

    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig) -> None:
        description = quant_config.quant_description
        self.quantizer = AscendQuantizer.get_quantizer(description)
        self.quant_method = self.quantizer.build_attention_method()

    def create_weights(self, layer: torch.nn.Module) -> None:
        """Attach placeholder sub-modules and their parameters to ``layer``.

        Weight names returned by the quant method must have the form
        ``"<module>.<weight>"`` (exactly one dot); each weight is registered
        as a non-trainable parameter on the named sub-module.
        """
        # ascend attention quantization might include some extra weights
        # and must be loaded by dummy modules
        for module_name in self.quant_method.get_extra_module_names():
            setattr(layer, module_name, torch.nn.Module())

        # During model initialization, the default dtype is set as the model
        # weight and activation dtype.
        default_dtype = torch.get_default_dtype()
        weights = self.quant_method.create_weights(default_dtype,
                                                   layer.num_heads,
                                                   layer.num_kv_heads)
        for qualified_name, tensor in weights.items():
            module_name, weight_name = qualified_name.split('.')
            holder = getattr(layer, module_name)
            parameter = torch.nn.Parameter(tensor, requires_grad=False)
            holder.register_parameter(weight_name, parameter)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Forward to the quant method's hook of the same name, if present."""
        if not hasattr(self.quant_method, "process_weights_after_loading"):
            return
        self.quant_method.process_weights_after_loading(layer)

    def apply(self, layer: torch.nn.Module, query: torch.Tensor,
              key: torch.Tensor, value: torch.Tensor,
              kv_cache: List[torch.Tensor], scale: torch.Tensor,
              seq_lens_tensor_cpu: int, block_tables: torch.Tensor,
              isPrefill: bool, attn_metadata, output) -> torch.Tensor:
        """Pass every argument, in order, to the quant method's ``apply``."""
        forwarded = (layer, query, key, value, kv_cache, scale,
                     seq_lens_tensor_cpu, block_tables, isPrefill,
                     attn_metadata, output)
        return self.quant_method.apply(*forwarded)
--- a/vllm_ascend/quantization/quantizer.py
+++ b/vllm_ascend/quantization/quantizer.py
@@ -0,0 +1,51 @@
 #
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 # This file is a part of the vllm-ascend project.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 import importlib
 from typing import Any, Dict, List
 # Names of quantization algorithms handled outside MindIE Turbo. Currently
 # empty, so get_quantizer always takes the MindIE Turbo path.
 CUSTOMIZED_QUANTIZER_TYPE: List[str] = []
 class AscendQuantizer:
    """Interface to the quantization implementations for Ascend hardware.

    ``get_quantizer`` selects a concrete quantizer; the ``build_*`` methods
    are placeholders that concrete quantizers are expected to override.
    """
    @classmethod
    def get_quantizer(cls, quant_config: Dict[str, Any]):
        """Return a quantizer for ``quant_config``.

        The ``mindie_turbo`` package is imported lazily and its
        ``MindIETurboQuantizer.get_quantizer`` does the selection.

        Args:
            quant_config: Quantization description, forwarded unchanged.

        Returns:
            The object returned by ``MindIETurboQuantizer.get_quantizer``,
            or ``None`` when the selected algorithm is listed in
            ``CUSTOMIZED_QUANTIZER_TYPE`` (never the case at the moment).

        Raises:
            NotImplementedError: If ``mindie_turbo`` cannot be imported or
                does not expose ``MindIETurboQuantizer``. The underlying
                error is attached as ``__cause__``.
        """
        # TODO: Need a param to choose quantization algorithms.
        quantization_algorithm = ''
        if quantization_algorithm in CUSTOMIZED_QUANTIZER_TYPE:
            return None
        try:
            module = importlib.import_module("mindie_turbo")
            MindIETurboQuantizer = module.MindIETurboQuantizer
        except Exception as err:
            # Chain the original failure so the reason the backend is
            # unavailable stays visible in the traceback.
            raise NotImplementedError(
                "There is no available ascend quantizer.") from err
        return MindIETurboQuantizer.get_quantizer(quant_config)
    def build_linear_method(self):
        """Build the linear-layer quant method; not implemented here."""
        raise NotImplementedError
    def build_moe_method(self):
        """Build the MoE quant method; not implemented here."""
        raise NotImplementedError
    def build_attention_method(self):
        """Build the attention quant method; not implemented here."""
        raise NotImplementedError