[Fix] Set div_mode to False and fix view_as position (#912)

### What this PR does / why we need it?

Set div_mode to False to use the ACLNN kernel, which is crucial when using ACL Graph.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-05-22 09:57:25 +08:00
parent 58b413752b
commit a73bd6caf4
2 changed files with 4 additions and 4 deletions
--- a/vllm_ascend/ops/attention.py
+++ b/vllm_ascend/ops/attention.py
@@ -131,7 +131,6 @@ def vanilla_chunked_prefill(

    attn_output = (attn_output[q_mask].view([-1, num_query_heads,
                                             head_dim]).to(output.dtype))
-    output = output.view_as(attn_output)
    output.copy_(attn_output)
    return attn_output

@@ -248,6 +247,7 @@ def vanilla_chunked_prefill_mla(

    attn_output = (attn_output[q_mask].view([-1, num_heads,
                                             v_head_dim]).to(output.dtype))
+    output = output.view_as(attn_output)
    output.copy_(attn_output)
    return attn_output

--- a/vllm_ascend/quantization/w8a8.py
+++ b/vllm_ascend/quantization/w8a8.py
@@ -24,7 +24,7 @@ import torch_npu
 def quant_per_tensor(in_tensor: torch.Tensor, input_scale: torch.Tensor,
                     input_offset: torch.Tensor):
    return torch_npu.npu_quantize(in_tensor, input_scale, input_offset,
-                                  torch.qint8, -1, True)
+                                  torch.qint8, -1, False)


 class AscendW8A8LinearMethod:
@@ -102,12 +102,12 @@ class AscendW8A8LinearMethod:

    def process_weights_after_loading(self, layer):
        expanding_factor = layer.weight.data.shape[1]
-        layer.aclnn_input_scale = torch.nn.Parameter(
+        layer.aclnn_input_scale = 1 / torch.nn.Parameter(
            layer.input_scale.data.repeat(expanding_factor),
            requires_grad=False)
        layer.aclnn_input_offset = torch.nn.Parameter(
            layer.input_offset.data.repeat(expanding_factor),
-            requires_grad=False)
+            requires_grad=False).to(layer.aclnn_input_scale.dtype)
        if self.transpose_weight:
            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
        layer.weight_scale.data = torch.flatten(layer.weight_scale.data)