### What this PR does / why we need it?
`VLLM_ASCEND_ENABLE_NZ` now has three options:
- 0: disable NZ;
- 1: enable NZ only for quantized weights;
- 2: enable NZ whenever possible.
`VLLM_ASCEND_ENABLE_NZ` defaults to 1.
All cases are shown in the table below:
| | W4A4 | W4A8 | W8A8 | fp16/bf16 | fp32 |
|---|---|---|---|---|---|
| trans NZ | can't support NZ | trans NZ by default | trans NZ by default | trans NZ when `VLLM_ASCEND_ENABLE_NZ` is 2 | can't support NZ |
| transpose | only supports the non-transposed case | only supports the transposed case | only supports the transposed case | linear: only supports the non-transposed case<br>gmm: only supports the transposed case | same as fp16/bf16 |
Some exceptional cases:
1. The MLAPO op needs to do some additional processing on the weights,
including the NZ transformation. If the MLAPO op is used, some weights
will be forcibly transformed to NZ;
2. MLA/SFA's weight `W_UV` is used by the op
`torch.ops._C_ascend.batch_matmul_transpose`, which currently cannot
support NZ.
### Does this PR introduce _any_ user-facing change?
Yes: fp16/bf16 weights are no longer transformed to NZ by default.
### How was this patch tested?
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: zzzzwwjj <1183291235@qq.com>
161 lines
5.5 KiB
Python
161 lines
5.5 KiB
Python
import os
|
|
import unittest
|
|
from unittest import mock
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import torch
|
|
from vllm import config
|
|
|
|
from tests.ut.base import TestBase
|
|
from vllm_ascend import ascend_config
|
|
from vllm_ascend.distributed import parallel_state
|
|
from vllm_ascend.ops.linear import (AscendMergedColumnParallelLinear,
|
|
AscendReplicatedLinear,
|
|
AscendRowParallelLinear,
|
|
AscendUnquantizedLinearMethod)
|
|
|
|
|
|
class BaseLinearTest(unittest.TestCase):
    """Shared fixture for the Ascend linear-layer tests.

    Replaces the distributed process groups and the ascend config with
    mocks so layers can be constructed without a real NPU/communicator.
    """

    def setUp(self):
        # One fake communication group stands in for every parallel group.
        self.mock_group = mock.MagicMock()
        self.mock_group.world_size = 2
        self.mock_group.rank_in_group = 0

        parallel_state._MLP_TP = self.mock_group
        parallel_state._OTP = self.mock_group

        self.mock_ascend_config = MagicMock()
        self.mock_ascend_config.finegrained_tp_config.oproj_tensor_parallel_size = 2
        self.mock_ascend_config.finegrained_tp_config.mlp_tensor_parallel_size = 2

        self.patches = [
            patch("vllm_ascend.ascend_config.get_ascend_config",
                  return_value=self.mock_ascend_config),
            patch("vllm_ascend.distributed.parallel_state.get_otp_group",
                  return_value=self.mock_group),
            patch("vllm_ascend.distributed.parallel_state.get_mlp_tp_group",
                  return_value=self.mock_group),
            patch("vllm_ascend.ops.linear_op.get_tp_group",
                  return_value=self.mock_group),
            patch(
                "vllm.distributed.parallel_state.get_tp_group",
                return_value=self.mock_group,
            ),
            patch("vllm_ascend.utils.mlp_tp_enable", return_value=True),
            patch("vllm_ascend.utils.oproj_tp_enable", return_value=True),
        ]

        for p in self.patches:
            p.start()
            # Fix: register the stop as a cleanup instead of relying on
            # tearDown.  unittest skips tearDown when setUp raises, so a
            # failure part-way through this loop used to leak active
            # patches into every subsequent test; addCleanup always runs.
            self.addCleanup(p.stop)

    def tearDown(self):
        # Patches are undone via addCleanup in setUp.  Kept (as a no-op)
        # so subclasses calling super().tearDown() keep working.
        pass
|
|
|
|
|
class TestAscendUnquantizedLinearMethod(TestBase):
    """Verify how process_weights_after_loading honours VLLM_ASCEND_ENABLE_NZ."""

    def setUp(self):
        self.method = AscendUnquantizedLinearMethod()
        self.layer = mock.MagicMock()
        # Make layer.weight.data.dtype report float16 (an unquantized dtype).
        type(self.layer.weight.data).dtype = mock.PropertyMock(
            return_value=torch.float16)

    @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
    @mock.patch("torch_npu.npu_format_cast")
    def test_process_weights_after_loading_with_nz0(self, mock_format_cast):
        # Option 0 disables NZ entirely, so no format cast should happen.
        self.method.process_weights_after_loading(self.layer)
        mock_format_cast.assert_not_called()

    @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "1"})
    @mock.patch("torch_npu.npu_format_cast")
    def test_process_weights_after_loading_with_nz1(self, mock_format_cast):
        # Option 1 applies NZ only to quantized weights; fp16 stays untouched.
        self.method.process_weights_after_loading(self.layer)
        mock_format_cast.assert_not_called()

    @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "2"})
    @mock.patch("torch_npu.npu_format_cast")
    def test_process_weights_after_loading_with_nz2(self, mock_format_cast):
        # Option 2 enables NZ whenever possible, so fp16 weights are cast too.
        self.method.process_weights_after_loading(self.layer)
        mock_format_cast.assert_called_once()
|
|
|
|
|
class TestAscendRowParallelLinear(BaseLinearTest):
    """Custom-op routing tests for AscendRowParallelLinear."""

    @staticmethod
    def _install_ascend_config(**tp_sizes):
        # Install a mocked global ascend config with the given
        # finegrained-TP sizes; everything else is disabled.
        cfg = MagicMock()
        cfg.recompute_scheduler_enable = False
        cfg.ascend_scheduler_config.enabled = False
        for attr, size in tp_sizes.items():
            setattr(cfg.finegrained_tp_config, attr, size)
        ascend_config._ASCEND_CONFIG = cfg

    def test_mlp_optimize(self):
        self._install_ascend_config(mlp_tensor_parallel_size=2)

        linear = AscendRowParallelLinear(
            input_size=16,
            output_size=8,
            prefix="down_proj",
        )
        # A "down_proj" layer must be routed to the MLP-TP group.
        self.assertEqual(linear.custom_op.comm_group, parallel_state._MLP_TP)

        # Forward pass should run without touching a real device.
        linear(torch.randn(16, 8))

    def test_oproj_tp(self):
        config._current_vllm_config = MagicMock()

        self._install_ascend_config(oproj_tensor_parallel_size=2)

        linear = AscendRowParallelLinear(
            input_size=16,
            output_size=8,
            prefix="o_proj",
        )
        # An "o_proj" layer must be routed to the OTP group.
        self.assertEqual(linear.custom_op.comm_group, parallel_state._OTP)

        linear(torch.randn(16, 8))
|
|
|
|
|
class TestAscendMergedColumnParallelLinear(BaseLinearTest):
    """Custom-op routing test for AscendMergedColumnParallelLinear."""

    def test_merged_mlp_tp_init(self):
        # Mocked global ascend config: MLP TP of 2, schedulers disabled.
        cfg = MagicMock()
        cfg.recompute_scheduler_enable = False
        cfg.finegrained_tp_config.mlp_tensor_parallel_size = 2
        cfg.ascend_scheduler_config.enabled = False
        ascend_config._ASCEND_CONFIG = cfg

        linear = AscendMergedColumnParallelLinear(
            input_size=16,
            output_sizes=[8, 8],
            prefix="gate_up_proj",
        )
        # A "gate_up_proj" layer must be routed to the MLP-TP group.
        self.assertEqual(linear.custom_op.comm_group, parallel_state._MLP_TP)
|
|
|
|
|
class TestAscendReplicatedLinear(BaseLinearTest):
    """AscendReplicatedLinear should fall back to the unquantized method."""

    def test_init_disable_tp(self):
        # Fix: this test was byte-identical to test_init_without_disable_tp
        # and never exercised the disable-TP path its name promises.
        # NOTE(review): assumes the layer forwards vLLM's ``disable_tp``
        # keyword — confirm against AscendReplicatedLinear's signature.
        linear = AscendReplicatedLinear(
            input_size=16,
            output_size=8,
            disable_tp=True,
        )
        self.assertIsInstance(linear.quant_method,
                              AscendUnquantizedLinearMethod)

    def test_init_without_disable_tp(self):
        linear = AscendReplicatedLinear(
            input_size=16,
            output_size=8,
        )
        # assertIsInstance gives a clearer failure message than
        # assertTrue(isinstance(...)).
        self.assertIsInstance(linear.quant_method,
                              AscendUnquantizedLinearMethod)
|
|
|
|
|
# Allow running this module directly as a script.
if __name__ == "__main__":
    unittest.main()