[Refactor] Refactor 310P ops and add unit tests (#6591)

### What this PR does / why we need it?

This pull request is a significant refactoring of the operations in the vllm-ascend project that are optimized for the Ascend 310P hardware. The changes streamline the implementation of core components such as quantization and multi-head attention, making the codebase more maintainable and robust. New unit tests are also introduced to ensure the correctness and reliability of the refactored modules.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

E2E test with qwen3-32b w8a8

- vLLM version: v0.15.0
- vLLM main: d7e17aaacd

---------

Signed-off-by: pu-zhe <zpuaa@outlook.com>
2026-02-07 09:25:17 +08:00
parent 6c49f95da2
commit 23524f2ca4
6 changed files with 173 additions and 28 deletions
--- a/vllm_ascend/_310p/ops/activation.py
+++ b/vllm_ascend/_310p/ops/activation.py
@@ -19,16 +19,10 @@ import torch
 import torch.nn.functional as F

 from vllm_ascend.ops.activation import AscendSiluAndMul
-from vllm_ascend.utils import get_weight_prefetch_method


 class AscendSiluAndMul310(AscendSiluAndMul):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        weight_prefetch_method = get_weight_prefetch_method()
-        if weight_prefetch_method:
-            weight_prefetch_method.maybe_prefetch_mlp_weight_preprocess(weight_prefetch_method.MLP_DOWN, x)
        h = x.shape[-1] // 2
        out = (F.silu(x[..., :h].to(torch.float32)) * x[..., h:].to(torch.float32)).to(torch.float16)
-        if weight_prefetch_method:
-            weight_prefetch_method.maybe_prefetch_mlp_weight_postprocess(out)
        return out