[Feat] Unquantized Linear to nz and control all nz-cast (#3356)

### What this PR does / why we need it?

Currently, when the Linear layers of models are executed in vLLM-Ascend, the weight format is ND in both the unquantized case and the skipped-ascend case. This PR supplements the execution logic for the Linear layer. We introduce a new global variable, VLLM_ASCEND_ENABLE_NZ. When VLLM_ASCEND_ENABLE_NZ=1 and the CANN version is 8.3, the weights of the Linear layer are converted to FRACTAL_NZ in both the unquantized case and the skipped-ascend case. We also use VLLM_ASCEND_ENABLE_NZ to control the existing NZ conversions, such as the w8a8-quantized case.

### Does this PR introduce _any_ user-facing change?

Yes. It adds a new global variable, VLLM_ASCEND_ENABLE_NZ. To use the NZ format, set VLLM_ASCEND_ENABLE_NZ=1.

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: anon189Ty <Stari_Falcon@outlook.com>
2025-10-14 17:39:26 +08:00
parent 5c45c227dc
commit 07e39620ea
22 changed files with 413 additions and 49 deletions
--- a/tests/ut/models/test_deepseek_v2.py
+++ b/tests/ut/models/test_deepseek_v2.py
@@ -12,7 +12,7 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 #
-from unittest.mock import Mock, patch
+from unittest.mock import MagicMock, Mock, patch

 import pytest
 import torch
@@ -20,6 +20,7 @@ from vllm.config import CacheConfig
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead

+from vllm_ascend import ascend_config
 from vllm_ascend.models.deepseek_v2 import (CustomDeepseekV2MLAAttention,
                                            CustomDeepseekV2RowParallelLinear)

@@ -46,6 +47,13 @@ def test_row_parallel_linear(cls, mock_distributed):
 def test_custom_deepseek_v2_mla_attention(mock_rms_norm, mock_mla_forward,
                                          mock_distributed, base_config):
    mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128))
+    # Make a fake ascend config because of the AscendLinearBase
+    vllm_config = MagicMock()
+    vllm_config.additional_config = None
+    vllm_config.parallel_config.enable_expert_parallel = False
+    vllm_config.parallel_config.tensor_parallel_size = 1
+    vllm_config.kv_transfer_config = None
+    ascend_config.init_ascend_config(vllm_config)

    attn = CustomDeepseekV2MLAAttention(config=base_config,
                                        hidden_size=128,
@@ -78,6 +86,7 @@ def test_custom_deepseek_v2_mla_attention(mock_rms_norm, mock_mla_forward,
                                        kv_lora_rank=16,
                                        prefix="layers.1.self_attn")
    assert hasattr(attn, "q_proj")
+    ascend_config._ASCEND_CONFIG = None


 def test_deepseek_v2_lmhead(mock_distributed, vllm_config):
@@ -90,6 +99,14 @@ def test_deepseek_v2_lmhead(mock_distributed, vllm_config):

    config = SimpleConfig()

+    # Make a fake ascend config because of the AscendLinearBase
+    vllm_config = MagicMock()
+    vllm_config.additional_config = None
+    vllm_config.parallel_config.enable_expert_parallel = False
+    vllm_config.parallel_config.tensor_parallel_size = 1
+    vllm_config.kv_transfer_config = None
+    ascend_config.init_ascend_config(vllm_config)
+
    # 直接创建lmhead和logits_processor
    lmhead = ParallelLMHead(config.vocab_size, config.hidden_size)
    logits_processor = LogitsProcessor(config.vocab_size)
@@ -105,3 +122,4 @@ def test_deepseek_v2_lmhead(mock_distributed, vllm_config):
                          return_value=mock_logits):
            logits = logits_processor(lmhead, mock_output)
    assert logits.shape == (2, 4, config.vocab_size)
+    ascend_config._ASCEND_CONFIG = None