Revert "[Feat] Unquantized linear nz support (#2619)" (#2896)

### What this PR does / why we need it?
This reverts commit 7b2ecc1e9a.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

- vLLM version: main
- vLLM main: 64d90c3e4f

Closes: https://github.com/vllm-project/vllm-ascend/issues/2890
Closes: https://github.com/vllm-project/vllm-ascend/issues/2887
Closes: https://github.com/vllm-project/vllm-ascend/issues/2885

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
This commit is contained in:
Yikun Jiang
2025-09-12 20:51:12 +08:00
committed by GitHub
parent fc2bcbe21c
commit 756b8a1946
4 changed files with 10 additions and 111 deletions

View File

@@ -5,13 +5,11 @@ from unittest.mock import MagicMock, patch
import torch
from tests.ut.base import TestBase
from vllm_ascend import ascend_config
from vllm_ascend.distributed import parallel_state
from vllm_ascend.ops.linear import (AscendColumnParallelLinear,
AscendMergedColumnParallelLinear,
AscendRowParallelLinear,
AscendUnquantizedLinearMethod)
AscendRowParallelLinear)
class BaseLinearTest(unittest.TestCase):
@@ -48,81 +46,6 @@ class BaseLinearTest(unittest.TestCase):
p.stop()
class TestAscendUnquantizedLinearMethod(TestBase):
    """Unit tests for AscendUnquantizedLinearMethod.

    Covers two code paths selected by the CANN toolkit version string
    (``torch.version.cann``):

    * CANN >= 8.3 ("8.3.RC1" in these tests): weights are format-cast via
      ``torch_npu.npu_format_cast`` after loading, and ``apply`` goes through
      ``torch.matmul``.
    * CANN < 8.3 ("8.2.RC1"): no format cast, and ``apply`` goes through
      ``torch.nn.functional.linear``.

    NOTE(review): this whole class is the deleted hunk of a revert commit;
    it tests functionality that the revert removes.
    """

    def setUp(self):
        # Fresh method instance per test; no state is shared between tests.
        self.method = AscendUnquantizedLinearMethod()

    # Decorators apply bottom-up: mock_version <- torch.version,
    # mock_format_cast <- torch_npu.npu_format_cast.
    @mock.patch("torch_npu.npu_format_cast")
    @mock.patch("torch.version")
    def test_process_weights_after_loading_is_cann_8_3(self, mock_version,
                                                       mock_format_cast):
        layer = mock.MagicMock()
        # Simulate a CANN 8.3 toolkit: the NZ format cast should happen.
        mock_version.cann = "8.3.RC1"
        self.method.process_weights_after_loading(layer)
        mock_format_cast.assert_called_once()

    @mock.patch("torch.version")
    def test_process_weights_after_loading_not_cann_8_3(self, mock_version):
        layer = mock.MagicMock()
        # Pre-8.3 CANN: the format-cast branch is skipped entirely.
        mock_version.cann = "8.2.RC1"
        # Should not raise exception
        self.method.process_weights_after_loading(layer)

    @mock.patch("torch.matmul")
    @mock.patch("torch.version")
    def test_apply_with_bias_is_cann_8_3(self, mock_version, mock_npu_matmul):
        layer = mock.MagicMock()
        layer.weight = torch.randn(128, 256)
        x = torch.randn(32, 128)
        bias = torch.randn(256)
        expected_y_output = torch.randn(32, 256)
        mock_npu_matmul.return_value = expected_y_output
        mock_version.cann = "8.3.RC1"
        output = self.method.apply(layer, x, bias)
        # NOTE(review): the in-place += mutates the very tensor the mocked
        # matmul returned, so this equality only holds if apply() adds the
        # bias onto (or returns) that same tensor object — fragile; confirm
        # against the implementation.
        expected_y_output += bias
        self.assertTrue(torch.equal(output, expected_y_output))

    @mock.patch("torch.matmul")
    @mock.patch("torch.version")
    def test_apply_without_bias_is_cann_8_3(self, mock_version,
                                            mock_npu_matmul):
        layer = mock.MagicMock()
        layer.weight = torch.randn(128, 256)
        x = torch.randn(32, 128)
        expected_y_output = torch.randn(32, 256)
        mock_npu_matmul.return_value = expected_y_output
        mock_version.cann = "8.3.RC1"
        # No bias: apply() should return the matmul result unchanged.
        output = self.method.apply(layer, x)
        self.assertTrue(torch.equal(output, expected_y_output))

    @mock.patch("torch.nn.functional.linear")
    @mock.patch("torch.version")
    def test_apply_not_cann_8_3(self, mock_version, mock_npu_linear):
        layer = mock.MagicMock()
        layer.weight = torch.randn(128, 256)
        x = torch.randn(32, 128)
        expected_y_output = torch.randn(32, 256)
        mock_npu_linear.return_value = expected_y_output
        # Pre-8.3 CANN: apply() is expected to route through F.linear
        # rather than torch.matmul.
        mock_version.cann = "8.2.RC1"
        output = self.method.apply(layer, x)
        self.assertTrue(torch.equal(output, expected_y_output))
class TestAscendRowParallelLinear(BaseLinearTest):
def test_mlp_optimize(self):

View File

@@ -4,10 +4,10 @@ import torch
from vllm.attention.layer import Attention
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.linear import LinearBase
from vllm.model_executor.layers.linear import (LinearBase,
UnquantizedLinearMethod)
from tests.ut.base import TestBase
from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
from vllm_ascend.quantization.quant_config import (AscendKVCacheMethod,
AscendQuantConfig)
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD
@@ -79,7 +79,7 @@ class TestAscendQuantConfig(TestBase):
'is_layer_skipped_ascend',
return_value=True):
method = self.ascend_config.get_quant_method(linear_layer, ".attn")
self.assertIsInstance(method, AscendUnquantizedLinearMethod)
self.assertIsInstance(method, UnquantizedLinearMethod)
# Test quantized layer
with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=False), \