[Feat] Unquantized Linear to nz and control all nz-cast (#3356)
### What this PR does / why we need it? Currently, when the Linear layers of models in vLLM-Ascend are executed, the weight format is ND in both the unquantized case and the skipped ascend case. This PR supplements the execution logic for the Linear layer. We introduce a new global variable: VLLM_ASCEND_ENABLE_NZ. When VLLM_ASCEND_ENABLE_NZ=1 and the CANN version is 8.3, the weights of the Linear layer are converted to FRACTAL_NZ in both the unquantized case and the skipped ascend case. We also use VLLM_ASCEND_ENABLE_NZ to control the existing NZ conversions, such as the one in the w8a8-quantized case. ### Does this PR introduce _any_ user-facing change? It adds a new global variable, VLLM_ASCEND_ENABLE_NZ. If you want to use the NZ format, you should set VLLM_ASCEND_ENABLE_NZ=1. ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: anon189Ty <Stari_Falcon@outlook.com>
This commit is contained in:
@@ -12,7 +12,7 @@
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
from unittest.mock import Mock, patch
|
||||
from unittest.mock import MagicMock, Mock, patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@@ -20,6 +20,7 @@ from vllm.config import CacheConfig
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||
|
||||
from vllm_ascend import ascend_config
|
||||
from vllm_ascend.models.deepseek_v2 import (CustomDeepseekV2MLAAttention,
|
||||
CustomDeepseekV2RowParallelLinear)
|
||||
|
||||
@@ -46,6 +47,13 @@ def test_row_parallel_linear(cls, mock_distributed):
|
||||
def test_custom_deepseek_v2_mla_attention(mock_rms_norm, mock_mla_forward,
|
||||
mock_distributed, base_config):
|
||||
mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128))
|
||||
# Make a fake ascend config because of the AscendLinearBase
|
||||
vllm_config = MagicMock()
|
||||
vllm_config.additional_config = None
|
||||
vllm_config.parallel_config.enable_expert_parallel = False
|
||||
vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
vllm_config.kv_transfer_config = None
|
||||
ascend_config.init_ascend_config(vllm_config)
|
||||
|
||||
attn = CustomDeepseekV2MLAAttention(config=base_config,
|
||||
hidden_size=128,
|
||||
@@ -78,6 +86,7 @@ def test_custom_deepseek_v2_mla_attention(mock_rms_norm, mock_mla_forward,
|
||||
kv_lora_rank=16,
|
||||
prefix="layers.1.self_attn")
|
||||
assert hasattr(attn, "q_proj")
|
||||
ascend_config._ASCEND_CONFIG = None
|
||||
|
||||
|
||||
def test_deepseek_v2_lmhead(mock_distributed, vllm_config):
|
||||
@@ -90,6 +99,14 @@ def test_deepseek_v2_lmhead(mock_distributed, vllm_config):
|
||||
|
||||
config = SimpleConfig()
|
||||
|
||||
# Make a fake ascend config because of the AscendLinearBase
|
||||
vllm_config = MagicMock()
|
||||
vllm_config.additional_config = None
|
||||
vllm_config.parallel_config.enable_expert_parallel = False
|
||||
vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
vllm_config.kv_transfer_config = None
|
||||
ascend_config.init_ascend_config(vllm_config)
|
||||
|
||||
# 直接创建lmhead和logits_processor
|
||||
lmhead = ParallelLMHead(config.vocab_size, config.hidden_size)
|
||||
logits_processor = LogitsProcessor(config.vocab_size)
|
||||
@@ -105,3 +122,4 @@ def test_deepseek_v2_lmhead(mock_distributed, vllm_config):
|
||||
return_value=mock_logits):
|
||||
logits = logits_processor(lmhead, mock_output)
|
||||
assert logits.shape == (2, 4, config.vocab_size)
|
||||
ascend_config._ASCEND_CONFIG = None
|
||||
|
||||
Reference in New Issue
Block a user