[Feat] Unquantized linear nz support (#2619)
### What this PR does / why we need it?
Currently, when execution reaches the Linear layer of a model in
vLLM-Ascend, the weight format is ND in both the unquantized case and
the skipped-ascend case, which is slower than FRACTAL_NZ.
This PR supplements the execution logic of the Linear layer: when
VLLM_ASCEND_ENABLE_MLP_OPTIMIZE=1 and the CANN version is 8.3, the
weights of the Linear layer are converted to FRACTAL_NZ, in both the
unquantized case and the skipped-ascend case.
- vLLM version: main
- vLLM main:
267c80d31f
Signed-off-by: anon189Ty <Stari_Falcon@outlook.com>
This commit is contained in:
@@ -5,11 +5,13 @@ from unittest.mock import MagicMock, patch
|
||||
|
||||
import torch
|
||||
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend import ascend_config
|
||||
from vllm_ascend.distributed import parallel_state
|
||||
from vllm_ascend.ops.linear import (AscendColumnParallelLinear,
|
||||
AscendMergedColumnParallelLinear,
|
||||
AscendRowParallelLinear)
|
||||
AscendRowParallelLinear,
|
||||
AscendUnquantizedLinearMethod)
|
||||
|
||||
|
||||
class BaseLinearTest(unittest.TestCase):
|
||||
@@ -46,6 +48,81 @@ class BaseLinearTest(unittest.TestCase):
|
||||
p.stop()
|
||||
|
||||
|
||||
class TestAscendUnquantizedLinearMethod(TestBase):
    """Unit tests for AscendUnquantizedLinearMethod.

    Covers both the CANN 8.3 path (weights cast to FRACTAL_NZ, matmul-based
    apply) and the pre-8.3 path (no format cast, F.linear-based apply).
    """

    def setUp(self):
        self.method = AscendUnquantizedLinearMethod()

    @mock.patch("torch_npu.npu_format_cast")
    @mock.patch("torch.version")
    def test_process_weights_after_loading_is_cann_8_3(
            self, version_mock, format_cast_mock):
        # On CANN 8.3 the weight is expected to be format-cast exactly once.
        version_mock.cann = "8.3.RC1"
        fake_layer = mock.MagicMock()

        self.method.process_weights_after_loading(fake_layer)
        format_cast_mock.assert_called_once()

    @mock.patch("torch.version")
    def test_process_weights_after_loading_not_cann_8_3(self, version_mock):
        # Pre-8.3 CANN: no cast is attempted and no exception is raised.
        version_mock.cann = "8.2.RC1"
        fake_layer = mock.MagicMock()

        self.method.process_weights_after_loading(fake_layer)

    @mock.patch("torch.matmul")
    @mock.patch("torch.version")
    def test_apply_with_bias_is_cann_8_3(self, version_mock, matmul_mock):
        version_mock.cann = "8.3.RC1"
        fake_layer = mock.MagicMock()
        fake_layer.weight = torch.randn(128, 256)

        inputs = torch.randn(32, 128)
        bias = torch.randn(256)

        matmul_result = torch.randn(32, 256)
        matmul_mock.return_value = matmul_result

        result = self.method.apply(fake_layer, inputs, bias)

        # apply() is expected to add the bias onto the matmul result;
        # the in-place add here mirrors that (kept after the apply call).
        matmul_result += bias
        self.assertTrue(torch.equal(result, matmul_result))

    @mock.patch("torch.matmul")
    @mock.patch("torch.version")
    def test_apply_without_bias_is_cann_8_3(self, version_mock, matmul_mock):
        version_mock.cann = "8.3.RC1"
        fake_layer = mock.MagicMock()
        fake_layer.weight = torch.randn(128, 256)

        inputs = torch.randn(32, 128)

        matmul_result = torch.randn(32, 256)
        matmul_mock.return_value = matmul_result

        result = self.method.apply(fake_layer, inputs)
        self.assertTrue(torch.equal(result, matmul_result))

    @mock.patch("torch.nn.functional.linear")
    @mock.patch("torch.version")
    def test_apply_not_cann_8_3(self, version_mock, linear_mock):
        # Pre-8.3 CANN falls back to torch.nn.functional.linear.
        version_mock.cann = "8.2.RC1"
        fake_layer = mock.MagicMock()
        fake_layer.weight = torch.randn(128, 256)

        inputs = torch.randn(32, 128)

        linear_result = torch.randn(32, 256)
        linear_mock.return_value = linear_result

        result = self.method.apply(fake_layer, inputs)
        self.assertTrue(torch.equal(result, linear_result))
|
||||
|
||||
|
||||
class TestAscendRowParallelLinear(BaseLinearTest):
|
||||
|
||||
def test_mlp_optimize(self):
|
||||
|
||||
@@ -4,10 +4,10 @@ import torch
|
||||
from vllm.attention.layer import Attention
|
||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
|
||||
from vllm.model_executor.layers.linear import (LinearBase,
|
||||
UnquantizedLinearMethod)
|
||||
from vllm.model_executor.layers.linear import LinearBase
|
||||
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
|
||||
from vllm_ascend.quantization.quant_config import (AscendKVCacheMethod,
|
||||
AscendQuantConfig)
|
||||
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD
|
||||
@@ -79,7 +79,7 @@ class TestAscendQuantConfig(TestBase):
|
||||
'is_layer_skipped_ascend',
|
||||
return_value=True):
|
||||
method = self.ascend_config.get_quant_method(linear_layer, ".attn")
|
||||
self.assertIsInstance(method, UnquantizedLinearMethod)
|
||||
self.assertIsInstance(method, AscendUnquantizedLinearMethod)
|
||||
|
||||
# Test quantized layer
|
||||
with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=False), \
|
||||
|
||||
Reference in New Issue
Block a user