[3/N][refactor] refactoer quantization (#2504)

### What this PR does / why we need it? Move torchair related qunatization section into torchair dir to make the code clear. Next step we'll remove all torchair related code outside of torchair quantization. ### Does this PR introduce _any_ user-facing change? NO ### How was this patch tested? vLLM version: main vLLM main: ab9f2cfd19 - vLLM version: v0.10.1.1 - vLLM main: 959783fb99 Signed-off-by: hust17yixuan <303660421@qq.com>
2025-08-27 10:45:50 +08:00
parent acdc53c2f6
commit 20a7bc4b71
10 changed files with 1752 additions and 3 deletions
--- a/vllm_ascend/torchair/models/torchair_deepseek_v2.py
+++ b/vllm_ascend/torchair/models/torchair_deepseek_v2.py
@@ -71,8 +71,9 @@ from vllm.sequence import IntermediateTensors

 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.quantization.quant_config import AscendLinearMethod
-from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod
 from vllm_ascend.torchair.ops.torchair_fused_moe import TorchairAscendFusedMoE
+from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import \
+    TorchairAscendW8A8DynamicLinearMethod
 from vllm_ascend.utils import dispose_tensor, npu_prefetch


@@ -261,8 +262,9 @@ class TorchairDeepseekV2MLP(nn.Module):
        quant_method = self.gate_up_proj.quant_method
        if isinstance(quant_method, UnquantizedLinearMethod):
            self.act_fn = TorchairDeepseekV2SiluAndMul()
-        elif (isinstance(quant_method, AscendLinearMethod) and isinstance(
-                quant_method.quant_method, AscendW8A8DynamicLinearMethod)):
+        elif (isinstance(quant_method, AscendLinearMethod)
+              and isinstance(quant_method.quant_method,
+                             TorchairAscendW8A8DynamicLinearMethod)):
            # TODO(sdmyzlp): Currently preserved as before:
            # 1. The only quantization supported for silu is W8A8Dynamic
            # 2. Output dtype of gate_up/down is fixed to be int32/bfloat16