init v0.11.0rc0

This commit is contained in:
2025-10-14 10:38:28 +08:00
parent 67afd0ea78
commit 66dc16f966
278 changed files with 28130 additions and 11708 deletions

View File

@@ -165,8 +165,6 @@ class TestTorchairDeepSeekMTP(PytestBase):
mocker.patch(
"vllm_ascend.torchair.models.torchair_deepseek_mtp.TorchairDeepSeekMultiTokenPredictorLayer.__call__",
return_value=None)
mocker.patch("vllm.model_executor.layers.sampler.get_sampler",
return_value=None)
mocker.patch(
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
return_value=None)

View File

@@ -100,6 +100,11 @@ def mock_distributed():
pp_group.rank_in_group = 0
pp_group.world_size = 1
mlp_tp_group = Mock(spec=GroupCoordinator)
mlp_tp_group.rank_in_group = 0
mlp_tp_group.world_size = 1
mlp_tp_group.all_gather = Mock(return_value=torch.randn(2, 4, 128))
mock_vllm_config = Mock()
mock_vllm_config.scheduler_config = Mock(max_num_seqs=256)
mock_vllm_config.model_config = Mock(max_model_len=2048, quant_config=None)
@@ -196,10 +201,6 @@ def test_torchair_deepseek_v2_mlp(mock_distributed, base_config):
quant_config=None)
assert isinstance(mlp.act_fn, TorchairDeepseekV2SiluAndMul)
x = torch.randn(2, 4, 128)
output = mlp(x)
assert output.shape == (2, 4, 128)
with patch(
"vllm_ascend.torchair.models.torchair_deepseek_v2.QuantizationConfig"
) as mock_quant_config:
@@ -274,7 +275,12 @@ def test_torchair_deepseek_v2_mla_attention(mock_rms_norm, mock_distributed,
@patch("torch_npu.npu_add_rms_norm")
@patch("torch_npu.npu_rms_norm")
def test_torchair_deepseek_v2_decoder_layer(mock_rms_norm, mock_add_norm,
@patch("torch.ops.vllm.maybe_wait_prefetch_done", side_effect=lambda x: None)
@patch("torch.ops.vllm.maybe_chunk_residual",
side_effect=lambda x, residual: residual)
def test_torchair_deepseek_v2_decoder_layer(mock_maybe_chunk_residual,
mock_maybe_wait_prefetch_done,
mock_rms_norm, mock_add_norm,
mock_distributed, base_config,
vllm_config):
mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128))

View File

@@ -24,10 +24,10 @@ from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase
from vllm_ascend.ascend_forward_context import _get_fused_moe_state
from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod
from vllm_ascend.quantization.quantizer import W8A8Quantizer
from vllm_ascend.torchair.ops.torchair_fused_moe import (
TorchairAscendFusedMoE, TorchairAscendUnquantizedFusedMoEMethod)
from vllm_ascend.utils import AscendSocVersion, adapt_patch # noqa E402
from vllm_ascend.utils import adapt_patch # noqa E402
from vllm_ascend.utils import AscendSocVersion, vllm_version_is
adapt_patch(True)
@@ -54,6 +54,10 @@ def mock_dp_and_tp_group(mocker):
@pytest.fixture
def mock_dist_env(mocker: MockerFixture):
# init dist env patch
if vllm_version_is("0.10.2"):
dp_metadata = MagicMock(cu_tokens_across_dp_cpu=[5, 10])
else:
dp_metadata = MagicMock(num_tokens_across_dp_cpu=[5, 5])
with patch('torch.distributed.get_rank', return_value=0), \
patch('torch.distributed.get_world_size', return_value=4), \
@@ -67,13 +71,13 @@ def mock_dist_env(mocker: MockerFixture):
patch('torch.distributed.all_to_all_single', return_value=torch.randn(8, 32)), \
patch('vllm_ascend.torchair.ops.torchair_fused_moe.tensor_model_parallel_all_reduce',
return_value=torch.randn(5, 32)), \
patch('vllm_ascend.torchair.ops.torchair_fused_moe.data_parallel_reduce_scatter',
return_value=torch.randn(5, 32)), \
patch('vllm.model_executor.layers.fused_moe.config.get_dp_group',
return_value=mock_dp_and_tp_group(mocker)), \
patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_config',
return_value=MagicMock(
torchair_graph_config=MagicMock(enabled=False, enable_multistream_moe=False),
torchair_graph_config=MagicMock(enabled=False),
enable_multistream_moe=False,
enable_shared_expert_dp=False,
expert_map_path=None
)), \
patch('vllm_ascend.torchair.ops.torchair_fused_moe.determine_expert_map',
@@ -81,7 +85,7 @@ def mock_dist_env(mocker: MockerFixture):
patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context',
return_value=MagicMock(
max_tokens_across_dp=10,
dp_metadata=MagicMock(cu_tokens_across_dp_cpu=[5, 10])
dp_metadata=dp_metadata,
)), \
patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_current_vllm_config',
return_value=MagicMock(
@@ -154,6 +158,8 @@ def default_moe_config():
def moe_method(mock_dist_env):
    """Return a ``TorchairAscendUnquantizedFusedMoEMethod`` built from a mocked MoE config.

    ``mock_dist_env`` is requested only so the distributed-environment patches
    are active while the method object is constructed.
    """
    fake_moe = MagicMock()
    parallel_cfg = fake_moe.moe_parallel_config
    # NOTE(review): ``ep_size=4`` is only visible when ``moe_parallel_config``
    # is *called*; plain attribute access yields an auto-created mock.
    # Confirm this is the intended wiring.
    parallel_cfg.return_value = MagicMock(ep_size=4)
    parallel_cfg.use_ep = False
    parallel_cfg.dp_size = 1
    return TorchairAscendUnquantizedFusedMoEMethod(fake_moe)
@@ -199,6 +205,9 @@ class MockFusedMoEMethod(FusedMoEMethodBase):
expert_weights: torch.Tensor) -> torch.Tensor:
pass
def get_fused_moe_quant_config(self, layer: torch.nn.Module):
    """Stub implementation: ignore ``layer`` and return ``None``.

    Presumably present only to satisfy the ``FusedMoEMethodBase`` interface
    for this mock -- confirm against the base class.
    """
    return None
class TestTorchairAscendFusedMoe:
@@ -236,12 +245,9 @@ class TestTorchairAscendFusedMoe:
mock_quant_method = MockFusedMoEMethod()
mock_quant_config.get_quant_method.return_value = mock_quant_method
mock_quant_config.is_layer_skipped_ascend.return_value = False
with patch(
'vllm_ascend.quantization.quantizer.AscendQuantizer.get_quantizer',
return_value=W8A8Quantizer):
with patch("vllm_ascend.quantization.quant_config.get_quant_method"):
moe = TorchairAscendFusedMoE(**default_moe_config,
quant_config=mock_quant_config)
assert moe.quant_method is not None
assert isinstance(moe.quant_method, AscendFusedMoEMethod)

View File

@@ -5,8 +5,9 @@ import torch
from tests.ut.base import TestBase
from vllm_ascend.torchair.ops.torchair_rotary_embedding import (
custom_rotary_embedding_enabled, native_rope_deepseek_forward,
rope_forward_oot, rotate_half, yarn_find_correction_dim, yarn_get_mscale)
_set_cos_sin_cache, custom_rotary_embedding_enabled,
native_rope_deepseek_forward, rope_forward_oot, rotate_half,
yarn_find_correction_dim, yarn_get_mscale)
class TestCustomRotaryEmbeddingEnabled(TestBase):
@@ -103,7 +104,7 @@ class TestRopeForwardOot(TestBase):
self.assertTrue(torch.equal(result_q, self.query))
self.assertTrue(torch.equal(result_k, self.key))
@patch('torch.ops._C')
@patch('torch.ops._C_ascend')
@patch(
'vllm_ascend.torchair.ops.torchair_rotary_embedding.get_ascend_config')
@patch('vllm_ascend.torchair.ops.torchair_rotary_embedding.is_310p',
@@ -200,6 +201,28 @@ class MockRopeModule:
self.sin_cached = None
self.rotary_dim = 1
self.base = 1
self.beta_fast = 32
self.beta_slow = 1
self.max_position_embeddings = 4096
self.mscale = 1.0
self.scaling_factor = 40
def register_buffer(self):
    """No-op placeholder that tests replace via ``patch.object``."""
    return None
class TestSetSinCosCache(TestBase):
    """Checks that ``_set_cos_sin_cache`` registers buffers on the module."""

    def test_set_cos_sin_cache(self):
        """``_set_cos_sin_cache`` must call ``register_buffer`` at least once."""
        rope = MockRopeModule()
        cache_kwargs = {"device": "cpu", "dtype": torch.bfloat16}
        # Spy on register_buffer so no real buffer handling is required;
        # 1024 is presumably the cached sequence length -- confirm.
        with patch.object(rope, "register_buffer") as register_buffer_spy:
            _set_cos_sin_cache(rope, 1024, **cache_kwargs)
        register_buffer_spy.assert_called()
class TestNativeRopeDeepseekForward(TestBase):
@@ -220,30 +243,6 @@ class TestNativeRopeDeepseekForward(TestBase):
assert q_pe.shape == query.shape
assert k_pe.shape == key.shape
@patch(
'vllm_ascend.torchair.ops.torchair_rotary_embedding._set_cos_sin_cache'
)
@patch(
'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot')
def test_native_rope_deepseek_forward_cache_handling(
self, mock_rope_forward_oot, mock_set_cache):
# Test cache situation is true
module = MockRopeModule(max_seq_len=1024)
positions = torch.tensor([1, 2, 3])
query = torch.randn(1, 8, 128)
key = torch.randn(1, 8, 128)
mock_rope_forward_oot.return_value = (query, key)
q_pe, k_pe = native_rope_deepseek_forward(module,
positions,
query,
key,
max_seq_len=2048)
assert q_pe.shape == query.shape
assert k_pe.shape == key.shape
@patch(
'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot')
def test_native_rope_deepseek_forward_key_reshaping(

View File

@@ -1,4 +1,3 @@
import copy
from unittest.mock import Mock, patch
import torch
@@ -85,19 +84,19 @@ class TestAscendW4A8DynamicFusedMoEMethod(TestBase):
# old quant version weight
param_dict = self.quant_method.get_dynamic_quant_param(
self.experts, self.input_size, self.output_size, torch.bfloat16)
self.assertEqual(param_dict["w13_weight_scale"].dtype, torch.bfloat16)
self.assertEqual(param_dict["w13_weight_scale"].dtype, torch.float32)
self.assertEqual(param_dict["w13_weight_scale"].shape,
(self.experts, 2 * self.input_size, 1))
self.assertEqual(param_dict["w13_weight_scale_second"].dtype,
torch.bfloat16)
torch.float32)
self.assertEqual(param_dict["w13_weight_scale_second"].shape,
(self.experts, 2 * self.input_size,
self.output_size // self.group_size))
self.assertEqual(param_dict["w2_weight_scale"].dtype, torch.bfloat16)
self.assertEqual(param_dict["w2_weight_scale"].dtype, torch.float32)
self.assertEqual(param_dict["w2_weight_scale"].shape,
(self.experts, self.output_size, 1))
self.assertEqual(param_dict["w2_weight_scale_second"].dtype,
torch.bfloat16)
torch.float32)
self.assertEqual(param_dict["w2_weight_scale_second"].shape,
(self.experts, self.output_size,
self.input_size // self.group_size))
@@ -109,40 +108,80 @@ class TestAscendW4A8DynamicFusedMoEMethod(TestBase):
self.assertEqual(
param_dict["w2_scale_bias"].shape,
(self.experts, self.output_size, 16 // self.quant_method.tp_size))
# per-channel weight
self.quant_method.is_per_channel_weight = True
param_dict = self.quant_method.get_dynamic_quant_param(
self.experts, self.input_size, self.output_size, torch.bfloat16)
pergroup_param = [
"w13_weight_scale_second", "w13_weight_offset_second",
"w2_weight_scale_second", "w2_weight_offset_second"
]
is_contains = any(key in param_dict for key in pergroup_param)
self.assertFalse(is_contains)
def build_layer(self,
                is_new_quant_version=True,
                is_per_channel_weight=False):
    """Create a bare ``torch.nn.Module`` populated with MoE test parameters.

    All parameters are created with ``requires_grad=False``.

    Args:
        is_new_quant_version: When true, use the new-version weight shapes
            and also attach ``w13_scale_bias`` / ``w2_scale_bias``; otherwise
            use the old-version weight shapes with no scale-bias tensors.
        is_per_channel_weight: When false, additionally attach the
            ``*_weight_scale_second`` and ``*_weight_offset_second``
            parameters; when true they are omitted.

    Returns:
        The populated ``torch.nn.Module``.
    """

    def _frozen(tensor):
        # Every attribute on the layer is a non-trainable Parameter.
        return torch.nn.Parameter(tensor, requires_grad=False)

    experts = self.experts
    in_size = self.input_size
    out_size = self.output_size

    layer = torch.nn.Module()

    if not is_new_quant_version:
        layer.w13_weight = _frozen(
            torch.zeros((experts, 2 * in_size, out_size), dtype=torch.int8))
        layer.w2_weight = _frozen(
            torch.zeros((experts, out_size, in_size), dtype=torch.int8))
    else:
        layer.w13_weight = _frozen(
            torch.zeros((experts, in_size, out_size), dtype=torch.int8))
        layer.w2_weight = _frozen(
            torch.zeros((experts, out_size // 2, in_size), dtype=torch.int8))
        layer.w13_scale_bias = _frozen(
            torch.zeros((experts, 2 * in_size, 1), dtype=torch.float32))
        layer.w2_scale_bias = _frozen(
            torch.zeros((experts, out_size, 16 // self.quant_method.tp_size),
                        dtype=torch.float32))

    # First-level scales exist for both quant versions.
    layer.w13_weight_scale = _frozen(
        torch.ones((experts, 2 * in_size, 1), dtype=torch.float32))
    layer.w2_weight_scale = _frozen(
        torch.ones((experts, out_size, 1), dtype=torch.float32))

    if is_per_channel_weight:
        return layer

    # Second-level scale/offset tensors are attached only when the weight is
    # not per-channel. Offsets are uninitialised (``empty_like``): only their
    # shape and dtype are meaningful.
    group = self.group_size
    layer.w13_weight_scale_second = _frozen(
        torch.ones((experts, 2 * in_size, out_size // group),
                   dtype=torch.float32))
    layer.w13_weight_offset_second = _frozen(
        torch.empty_like(layer.w13_weight_scale_second.data))
    layer.w2_weight_scale_second = _frozen(
        torch.ones((experts, out_size, in_size // group),
                   dtype=torch.float32))
    layer.w2_weight_offset_second = _frozen(
        torch.empty_like(layer.w2_weight_scale_second.data))
    return layer
@patch('torch_npu.npu_quantize')
@patch('torch.Tensor.npu')
def test_process_weights_after_loading(self, mock_npu, mock_npu_quantize):
# old quant version weight
layer = torch.nn.Module()
layer.w13_weight = torch.nn.Parameter(torch.zeros(
(self.experts, 2 * self.input_size, self.output_size),
dtype=torch.int8),
requires_grad=False)
layer.w2_weight = torch.nn.Parameter(torch.zeros(
(self.experts, self.output_size, self.input_size),
dtype=torch.int8),
requires_grad=False)
layer.w13_weight_scale = torch.nn.Parameter(torch.ones(
(self.experts, 2 * self.input_size, 1), dtype=torch.bfloat16),
requires_grad=False)
layer.w13_weight_scale_second = torch.nn.Parameter(torch.ones(
(self.experts, 2 * self.input_size,
self.output_size // self.group_size),
dtype=torch.bfloat16),
requires_grad=False)
layer.w2_weight_scale = torch.nn.Parameter(torch.ones(
(self.experts, self.output_size, 1), dtype=torch.bfloat16),
requires_grad=False)
layer.w2_weight_scale_second = torch.nn.Parameter(torch.ones(
(self.experts, self.output_size,
self.input_size // self.group_size),
dtype=torch.bfloat16),
requires_grad=False)
new_layer = copy.deepcopy(layer)
mock_npu.return_value = torch.Tensor()
mock_npu_quantize.return_value = torch.Tensor()
# old quant version weight
layer = self.build_layer(is_new_quant_version=False)
self.quant_method.process_weights_after_loading(layer)
self.assertTrue(hasattr(layer, "w13_scale_bias"))
self.assertEqual(layer.w13_scale_bias.data.shape,
@@ -154,23 +193,17 @@ class TestAscendW4A8DynamicFusedMoEMethod(TestBase):
self.assertEqual(layer.w2_scale_bias.data.dtype, torch.float32)
# new quant version weight
self.quant_method.new_quant_version = True
new_layer.w13_weight.data = torch.zeros(
(self.experts, self.input_size, self.output_size),
dtype=torch.int8)
new_layer.w2_weight.data = torch.zeros(
(self.experts, self.output_size // 2, self.input_size),
dtype=torch.int8)
w13_scale_bias = torch.zeros((self.experts, 2 * self.input_size, 1),
dtype=torch.float32)
new_layer.w13_scale_bias = torch.nn.Parameter(w13_scale_bias,
requires_grad=False)
w2_scale_bias = torch.zeros(
(self.experts, self.output_size, 16 // self.quant_method.tp_size),
dtype=torch.float32)
new_layer.w2_scale_bias = torch.nn.Parameter(w2_scale_bias,
requires_grad=False)
new_layer = self.build_layer(is_new_quant_version=True)
self.quant_method.process_weights_after_loading(new_layer)
self.assertEqual(new_layer.w13_scale_bias.data.shape,
(self.experts, 2 * self.input_size))
self.assertEqual(new_layer.w2_scale_bias.data.shape,
(self.experts, self.output_size))
self.assertFalse(hasattr(new_layer, "w13_weight_scale_second"))
# per-channel weight: build a layer without the second-level (per-group)
# scale/offset tensors and run it through the same post-load processing.
self.quant_method.is_per_channel_weight = True
per_channel_layer = self.build_layer(is_new_quant_version=True,
                                     is_per_channel_weight=True)
self.quant_method.process_weights_after_loading(per_channel_layer)
# Assert on the layer that was just processed. The previous assertion
# re-checked ``new_layer``, so the per-channel path was never verified.
self.assertEqual(per_channel_layer.w13_scale_bias.data.shape,
                 (self.experts, 2 * self.input_size))

View File

@@ -0,0 +1,95 @@
from unittest.mock import MagicMock, patch
import torch
from vllm.attention.backends.abstract import AttentionType
from vllm.distributed.parallel_state import GroupCoordinator
from tests.ut.base import TestBase
from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.torchair.torchair_attention import \
AscendAttentionTorchairBackendImpl
class TestAscendAttentionTorchairBackendImpl(TestBase):
    """Tests for ``AscendAttentionTorchairBackendImpl`` with the NPU ops mocked."""

    # TODO: the original author left bare TODO markers on most of the patches
    # and constants in ``setUp`` and on the ``decode`` mock; revisit whether
    # each is still required.
    @patch("torch.zeros")
    @patch('vllm.distributed.parallel_state._TP',
           new_callable=lambda: MagicMock(spec=GroupCoordinator))
    @patch("vllm.distributed.get_tensor_model_parallel_world_size",
           return_value=2)
    @patch("vllm.config.get_current_vllm_config")
    @patch("vllm_ascend.torchair.torchair_mla.get_ascend_config")
    def setUp(self, ascend_config, vllm_config, mock_get_tp_size, mock_tp,
              mock_zeros):
        """Build ``self.impl`` while the distributed/config patches are active.

        The mock arguments arrive bottom-up relative to the decorator stack.
        """
        mock_tp.world_size = 2

        # NOTE(review): ``ascend_config`` and ``vllm_config`` are the patched
        # *callables*; the attributes below are set on those mocks, not on
        # their ``return_value``. Confirm that is what the code under test
        # reads.
        graph_cfg = ascend_config.torchair_graph_config
        graph_cfg.enabled = True
        graph_cfg.enable_kv_nz = False

        spec_cfg = MagicMock()
        spec_cfg.num_speculative_tokens = 4
        vllm_config.speculative_config = spec_cfg

        # ``torch.zeros`` is patched for the duration of setUp; hand back a
        # scalar int32 CPU tensor instead.
        mock_zeros.return_value = torch.ones((),
                                             device='cpu',
                                             dtype=torch.int32)

        impl_kwargs = dict(
            num_heads=32,
            head_size=128,
            scale=0.1,
            num_kv_heads=4,
            alibi_slopes=None,
            sliding_window=None,
            kv_cache_dtype="auto",
            blocksparse_params=None,
            logits_soft_cap=None,
            attn_type=AttentionType.DECODER,
            kv_sharing_target_layer_name=None,
        )
        self.impl = AscendAttentionTorchairBackendImpl(**impl_kwargs)

    @patch("torch_npu.npu_scatter_nd_update_")
    @patch("torch_npu.npu_fused_infer_attention_score")
    def test_forward_with_decode_only(self, mock_fused, _):
        """``forward`` in the DecodeOnly state returns one row per token."""
        impl = self.impl
        num_tokens, seq_len = 100, 1
        num_blocks, block_size = 256, 4
        q_width = impl.num_heads * impl.head_size
        kv_width = impl.num_kv_heads * impl.head_size

        layer = MagicMock()
        layer._k_scale_float = 1.0
        layer._v_scale_float = 1.0

        query = torch.randn(num_tokens, seq_len, q_width)
        key = torch.randn(num_tokens, seq_len, kv_width)
        value = torch.randn(num_tokens, seq_len, kv_width)
        # Both cache halves use the query width (num_heads * head_size).
        key_cache = torch.randn(num_blocks, block_size, q_width)
        value_cache = torch.randn(num_blocks, block_size, q_width)
        kv_cache = (key_cache, value_cache)
        output = torch.randn(num_tokens, impl.num_heads, impl.head_size)

        decode = MagicMock()
        decode.seq_lens_list = [2] * num_tokens
        decode.block_table = torch.ones(num_tokens, 8, dtype=torch.int32)
        decode.attn_mask = None

        metadata = MagicMock()
        metadata.attn_state = AscendAttentionState.DecodeOnly
        metadata.slot_mapping = torch.arange(num_tokens, dtype=torch.int32)
        metadata.decode = decode

        # The fused attention kernel is mocked to return a 2-tuple of tensors.
        fused_out = torch.ones(num_tokens, impl.num_heads, impl.head_size)
        mock_fused.return_value = (fused_out, torch.ones(1))

        result = impl.forward(layer, query, key, value, kv_cache, metadata,
                              output, False)

        self.assertEqual(result.shape[0], num_tokens)

View File

@@ -190,12 +190,15 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_vllm_config.speculative_config = None
ascend_config = MagicMock()
ascend_config.torchair_graph_config = MagicMock()
ascend_config.torchair_graph_config.enabled = True
with patch("vllm_ascend.torchair.torchair_mla.get_ascend_config",
return_value=ascend_config):
builder = AscendMLATorchairMetadataBuilder(mock_vllm_config,
builder = AscendMLATorchairMetadataBuilder(None, None,
mock_vllm_config,
mock_device)
self.assertEqual(builder.block_size,
@@ -216,7 +219,10 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
ascend_config.torchair_graph_config = MagicMock()
ascend_config.torchair_graph_config.enabled = True
builder = AscendMLATorchairMetadataBuilder(mock_vllm_config,
mock_vllm_config.speculative_config = None
builder = AscendMLATorchairMetadataBuilder(None, None,
mock_vllm_config,
mock_device)
input_batch = MagicMock()
@@ -250,9 +256,12 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_vllm_config.speculative_config = None
with patch("vllm_ascend.torchair.torchair_mla.get_ascend_config",
return_value=ascend_config):
builder = AscendMLATorchairMetadataBuilder(mock_vllm_config,
builder = AscendMLATorchairMetadataBuilder(None, None,
mock_vllm_config,
mock_device)
input_batch = MagicMock()
@@ -285,7 +294,10 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
builder = AscendMLATorchairMetadataBuilder(mock_vllm_config,
mock_vllm_config.speculative_config = None
builder = AscendMLATorchairMetadataBuilder(None, None,
mock_vllm_config,
mock_device)
block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32)
@@ -305,7 +317,10 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
builder = AscendMLATorchairMetadataBuilder(mock_vllm_config,
mock_vllm_config.speculative_config = None
builder = AscendMLATorchairMetadataBuilder(None, None,
mock_vllm_config,
mock_device)
block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32)
@@ -326,7 +341,10 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
builder = AscendMLATorchairMetadataBuilder(mock_vllm_config,
mock_vllm_config.speculative_config = None
builder = AscendMLATorchairMetadataBuilder(None, None,
mock_vllm_config,
mock_device)
block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32)
@@ -351,7 +369,11 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
mock_vllm_config.model_config.dtype = torch.float16
mock_device = 'cpu'
mock_vllm_config.speculative_config = None
builder = AscendMLATorchairMetadataBuilder(
None,
None,
mock_vllm_config,
mock_device,
metadata_cls=AscendMLATorchairMetadata)
@@ -416,7 +438,11 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
model = MagicMock(spec=nn.Module)
model.model = MagicMock(spec=nn.Module)
mock_vllm_config.speculative_config = None
builder = AscendMLATorchairMetadataBuilder(
None,
None,
mock_vllm_config,
mock_device,
metadata_cls=AscendMLATorchairMetadata)
@@ -437,14 +463,16 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
max_query_len=1,
decode_token_per_req=torch.tensor([1, 1, 1]),
block_table_tensor=torch.zeros((10, 10)),
slot_mapping_cpu=torch.tensor(range(20)),
slot_mapping=torch.tensor(range(20)),
actual_seq_lengths_q=torch.tensor([0, 1, 2]),
positions=torch.tensor([1, 1]),
attn_mask=torch.ones((15, 15)),
spec_attn_mask=None,
attn_state=AscendAttentionState.ChunkedPrefill)
attn_state=AscendAttentionState.ChunkedPrefill,
num_computed_tokens_cpu=None,
seq_lens=None)
metadata = builder.build(common_attn_metadata, model)
metadata = builder.build(1, common_attn_metadata, model)
self.assertIsInstance(metadata, AscendMLATorchairMetadata)
self.assertEqual(metadata.num_input_tokens, 0)

View File

@@ -6,7 +6,6 @@ from unittest.mock import MagicMock, patch
import torch
from tests.ut.base import TestBase
from vllm_ascend.quantization.quantizer import SUPPORT_ASCEND_QUANTIZER_TYPE
from vllm_ascend.torchair import utils
@@ -135,15 +134,3 @@ class TestTorchairUtils(TestBase):
utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
mock_npu_cast.assert_not_called()
def test_torchair_quant_method_register(self):
TorchairW8A8DYNAMICQuantizer = SUPPORT_ASCEND_QUANTIZER_TYPE[
"W8A8_DYNAMIC"]
TorchairW4A8DYNAMICQuantizer = SUPPORT_ASCEND_QUANTIZER_TYPE[
"W4A8_DYNAMIC"]
utils.torchair_quant_method_register()
self.assertNotEqual(TorchairW8A8DYNAMICQuantizer,
SUPPORT_ASCEND_QUANTIZER_TYPE["W8A8_DYNAMIC"])
self.assertNotEqual(TorchairW4A8DYNAMICQuantizer,
SUPPORT_ASCEND_QUANTIZER_TYPE["W4A8_DYNAMIC"])