init v0.11.0rc0
This commit is contained in:
@@ -165,8 +165,6 @@ class TestTorchairDeepSeekMTP(PytestBase):
|
||||
mocker.patch(
|
||||
"vllm_ascend.torchair.models.torchair_deepseek_mtp.TorchairDeepSeekMultiTokenPredictorLayer.__call__",
|
||||
return_value=None)
|
||||
mocker.patch("vllm.model_executor.layers.sampler.get_sampler",
|
||||
return_value=None)
|
||||
mocker.patch(
|
||||
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
|
||||
return_value=None)
|
||||
|
||||
@@ -100,6 +100,11 @@ def mock_distributed():
|
||||
pp_group.rank_in_group = 0
|
||||
pp_group.world_size = 1
|
||||
|
||||
mlp_tp_group = Mock(spec=GroupCoordinator)
|
||||
mlp_tp_group.rank_in_group = 0
|
||||
mlp_tp_group.world_size = 1
|
||||
mlp_tp_group.all_gather = Mock(return_value=torch.randn(2, 4, 128))
|
||||
|
||||
mock_vllm_config = Mock()
|
||||
mock_vllm_config.scheduler_config = Mock(max_num_seqs=256)
|
||||
mock_vllm_config.model_config = Mock(max_model_len=2048, quant_config=None)
|
||||
@@ -196,10 +201,6 @@ def test_torchair_deepseek_v2_mlp(mock_distributed, base_config):
|
||||
quant_config=None)
|
||||
assert isinstance(mlp.act_fn, TorchairDeepseekV2SiluAndMul)
|
||||
|
||||
x = torch.randn(2, 4, 128)
|
||||
output = mlp(x)
|
||||
assert output.shape == (2, 4, 128)
|
||||
|
||||
with patch(
|
||||
"vllm_ascend.torchair.models.torchair_deepseek_v2.QuantizationConfig"
|
||||
) as mock_quant_config:
|
||||
@@ -274,7 +275,12 @@ def test_torchair_deepseek_v2_mla_attention(mock_rms_norm, mock_distributed,
|
||||
|
||||
@patch("torch_npu.npu_add_rms_norm")
|
||||
@patch("torch_npu.npu_rms_norm")
|
||||
def test_torchair_deepseek_v2_decoder_layer(mock_rms_norm, mock_add_norm,
|
||||
@patch("torch.ops.vllm.maybe_wait_prefetch_done", side_effect=lambda x: None)
|
||||
@patch("torch.ops.vllm.maybe_chunk_residual",
|
||||
side_effect=lambda x, residual: residual)
|
||||
def test_torchair_deepseek_v2_decoder_layer(mock_maybe_chunk_residual,
|
||||
mock_maybe_wait_prefetch_done,
|
||||
mock_rms_norm, mock_add_norm,
|
||||
mock_distributed, base_config,
|
||||
vllm_config):
|
||||
mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128))
|
||||
|
||||
@@ -24,10 +24,10 @@ from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase
|
||||
|
||||
from vllm_ascend.ascend_forward_context import _get_fused_moe_state
|
||||
from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod
|
||||
from vllm_ascend.quantization.quantizer import W8A8Quantizer
|
||||
from vllm_ascend.torchair.ops.torchair_fused_moe import (
|
||||
TorchairAscendFusedMoE, TorchairAscendUnquantizedFusedMoEMethod)
|
||||
from vllm_ascend.utils import AscendSocVersion, adapt_patch # noqa E402
|
||||
from vllm_ascend.utils import adapt_patch # noqa E402
|
||||
from vllm_ascend.utils import AscendSocVersion, vllm_version_is
|
||||
|
||||
adapt_patch(True)
|
||||
|
||||
@@ -54,6 +54,10 @@ def mock_dp_and_tp_group(mocker):
|
||||
@pytest.fixture
|
||||
def mock_dist_env(mocker: MockerFixture):
|
||||
# init dist env patch
|
||||
if vllm_version_is("0.10.2"):
|
||||
dp_metadata = MagicMock(cu_tokens_across_dp_cpu=[5, 10])
|
||||
else:
|
||||
dp_metadata = MagicMock(num_tokens_across_dp_cpu=[5, 5])
|
||||
|
||||
with patch('torch.distributed.get_rank', return_value=0), \
|
||||
patch('torch.distributed.get_world_size', return_value=4), \
|
||||
@@ -67,13 +71,13 @@ def mock_dist_env(mocker: MockerFixture):
|
||||
patch('torch.distributed.all_to_all_single', return_value=torch.randn(8, 32)), \
|
||||
patch('vllm_ascend.torchair.ops.torchair_fused_moe.tensor_model_parallel_all_reduce',
|
||||
return_value=torch.randn(5, 32)), \
|
||||
patch('vllm_ascend.torchair.ops.torchair_fused_moe.data_parallel_reduce_scatter',
|
||||
return_value=torch.randn(5, 32)), \
|
||||
patch('vllm.model_executor.layers.fused_moe.config.get_dp_group',
|
||||
return_value=mock_dp_and_tp_group(mocker)), \
|
||||
patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_config',
|
||||
return_value=MagicMock(
|
||||
torchair_graph_config=MagicMock(enabled=False, enable_multistream_moe=False),
|
||||
torchair_graph_config=MagicMock(enabled=False),
|
||||
enable_multistream_moe=False,
|
||||
enable_shared_expert_dp=False,
|
||||
expert_map_path=None
|
||||
)), \
|
||||
patch('vllm_ascend.torchair.ops.torchair_fused_moe.determine_expert_map',
|
||||
@@ -81,7 +85,7 @@ def mock_dist_env(mocker: MockerFixture):
|
||||
patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context',
|
||||
return_value=MagicMock(
|
||||
max_tokens_across_dp=10,
|
||||
dp_metadata=MagicMock(cu_tokens_across_dp_cpu=[5, 10])
|
||||
dp_metadata=dp_metadata,
|
||||
)), \
|
||||
patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_current_vllm_config',
|
||||
return_value=MagicMock(
|
||||
@@ -154,6 +158,8 @@ def default_moe_config():
|
||||
# Returns a TorchairAscendUnquantizedFusedMoEMethod built from a MagicMock
# MoE config. `mock_dist_env` is accepted but never referenced in the body
# (presumably requested only so that its patches are active -- confirm).
def moe_method(mock_dist_env):
|
||||
moe = MagicMock()
|
||||
# NOTE(review): this configures the value returned by *calling*
# moe.moe_parallel_config(); the two assignments below instead set
# attributes directly on moe.moe_parallel_config.
moe.moe_parallel_config.return_value = MagicMock(ep_size=4)
|
||||
moe.moe_parallel_config.use_ep = False
|
||||
moe.moe_parallel_config.dp_size = 1
|
||||
return TorchairAscendUnquantizedFusedMoEMethod(moe)
|
||||
|
||||
|
||||
@@ -199,6 +205,9 @@ class MockFusedMoEMethod(FusedMoEMethodBase):
|
||||
expert_weights: torch.Tensor) -> torch.Tensor:
|
||||
pass
|
||||
|
||||
# No-op stub: accepts a layer and implicitly returns None.
# Presumably present so the mock class satisfies an interface required by
# its base class -- TODO confirm against FusedMoEMethodBase.
def get_fused_moe_quant_config(self, layer: torch.nn.Module):
|
||||
pass
|
||||
|
||||
|
||||
class TestTorchairAscendFusedMoe:
|
||||
|
||||
@@ -236,12 +245,9 @@ class TestTorchairAscendFusedMoe:
|
||||
mock_quant_method = MockFusedMoEMethod()
|
||||
mock_quant_config.get_quant_method.return_value = mock_quant_method
|
||||
mock_quant_config.is_layer_skipped_ascend.return_value = False
|
||||
with patch(
|
||||
'vllm_ascend.quantization.quantizer.AscendQuantizer.get_quantizer',
|
||||
return_value=W8A8Quantizer):
|
||||
with patch("vllm_ascend.quantization.quant_config.get_quant_method"):
|
||||
moe = TorchairAscendFusedMoE(**default_moe_config,
|
||||
quant_config=mock_quant_config)
|
||||
|
||||
assert moe.quant_method is not None
|
||||
assert isinstance(moe.quant_method, AscendFusedMoEMethod)
|
||||
|
||||
|
||||
@@ -5,8 +5,9 @@ import torch
|
||||
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend.torchair.ops.torchair_rotary_embedding import (
|
||||
custom_rotary_embedding_enabled, native_rope_deepseek_forward,
|
||||
rope_forward_oot, rotate_half, yarn_find_correction_dim, yarn_get_mscale)
|
||||
_set_cos_sin_cache, custom_rotary_embedding_enabled,
|
||||
native_rope_deepseek_forward, rope_forward_oot, rotate_half,
|
||||
yarn_find_correction_dim, yarn_get_mscale)
|
||||
|
||||
|
||||
class TestCustomRotaryEmbeddingEnabled(TestBase):
|
||||
@@ -103,7 +104,7 @@ class TestRopeForwardOot(TestBase):
|
||||
self.assertTrue(torch.equal(result_q, self.query))
|
||||
self.assertTrue(torch.equal(result_k, self.key))
|
||||
|
||||
@patch('torch.ops._C')
|
||||
@patch('torch.ops._C_ascend')
|
||||
@patch(
|
||||
'vllm_ascend.torchair.ops.torchair_rotary_embedding.get_ascend_config')
|
||||
@patch('vllm_ascend.torchair.ops.torchair_rotary_embedding.is_310p',
|
||||
@@ -200,6 +201,28 @@ class MockRopeModule:
|
||||
self.sin_cached = None
|
||||
self.rotary_dim = 1
|
||||
self.base = 1
|
||||
self.beta_fast = 32
|
||||
self.beta_slow = 1
|
||||
self.max_position_embeddings = 4096
|
||||
self.mscale = 1.0
|
||||
self.scaling_factor = 40
|
||||
|
||||
# No-op placeholder that gives the mock module a `register_buffer` attribute;
# test_set_cos_sin_cache replaces it via patch.object(module, "register_buffer").
# NOTE(review): it takes no arguments beyond self, so it looks usable only
# while patched -- confirm no test calls it unpatched with arguments.
def register_buffer(self):
|
||||
pass
|
||||
|
||||
|
||||
# Tests _set_cos_sin_cache against a MockRopeModule instance.
class TestSetSinCosCache(TestBase):
|
||||
|
||||
# Calls _set_cos_sin_cache(module, 1024, device="cpu", dtype=torch.bfloat16)
# with module.register_buffer patched, then checks the patched method was
# called at least once. The arguments passed to it are not inspected.
def test_set_cos_sin_cache(self):
|
||||
module = MockRopeModule()
|
||||
|
||||
with patch.object(module, "register_buffer") as mock_register_buffer:
|
||||
_set_cos_sin_cache(module,
|
||||
1024,
|
||||
device="cpu",
|
||||
dtype=torch.bfloat16)
|
||||
|
||||
mock_register_buffer.assert_called()
|
||||
|
||||
|
||||
class TestNativeRopeDeepseekForward(TestBase):
|
||||
@@ -220,30 +243,6 @@ class TestNativeRopeDeepseekForward(TestBase):
|
||||
assert q_pe.shape == query.shape
|
||||
assert k_pe.shape == key.shape
|
||||
|
||||
@patch(
|
||||
'vllm_ascend.torchair.ops.torchair_rotary_embedding._set_cos_sin_cache'
|
||||
)
|
||||
@patch(
|
||||
'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot')
|
||||
def test_native_rope_deepseek_forward_cache_handling(
|
||||
self, mock_rope_forward_oot, mock_set_cache):
|
||||
# Test cache situation is true
|
||||
module = MockRopeModule(max_seq_len=1024)
|
||||
positions = torch.tensor([1, 2, 3])
|
||||
query = torch.randn(1, 8, 128)
|
||||
key = torch.randn(1, 8, 128)
|
||||
|
||||
mock_rope_forward_oot.return_value = (query, key)
|
||||
|
||||
q_pe, k_pe = native_rope_deepseek_forward(module,
|
||||
positions,
|
||||
query,
|
||||
key,
|
||||
max_seq_len=2048)
|
||||
|
||||
assert q_pe.shape == query.shape
|
||||
assert k_pe.shape == key.shape
|
||||
|
||||
@patch(
|
||||
'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot')
|
||||
def test_native_rope_deepseek_forward_key_reshaping(
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import copy
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import torch
|
||||
@@ -85,19 +84,19 @@ class TestAscendW4A8DynamicFusedMoEMethod(TestBase):
|
||||
# old quant version weight
|
||||
param_dict = self.quant_method.get_dynamic_quant_param(
|
||||
self.experts, self.input_size, self.output_size, torch.bfloat16)
|
||||
self.assertEqual(param_dict["w13_weight_scale"].dtype, torch.bfloat16)
|
||||
self.assertEqual(param_dict["w13_weight_scale"].dtype, torch.float32)
|
||||
self.assertEqual(param_dict["w13_weight_scale"].shape,
|
||||
(self.experts, 2 * self.input_size, 1))
|
||||
self.assertEqual(param_dict["w13_weight_scale_second"].dtype,
|
||||
torch.bfloat16)
|
||||
torch.float32)
|
||||
self.assertEqual(param_dict["w13_weight_scale_second"].shape,
|
||||
(self.experts, 2 * self.input_size,
|
||||
self.output_size // self.group_size))
|
||||
self.assertEqual(param_dict["w2_weight_scale"].dtype, torch.bfloat16)
|
||||
self.assertEqual(param_dict["w2_weight_scale"].dtype, torch.float32)
|
||||
self.assertEqual(param_dict["w2_weight_scale"].shape,
|
||||
(self.experts, self.output_size, 1))
|
||||
self.assertEqual(param_dict["w2_weight_scale_second"].dtype,
|
||||
torch.bfloat16)
|
||||
torch.float32)
|
||||
self.assertEqual(param_dict["w2_weight_scale_second"].shape,
|
||||
(self.experts, self.output_size,
|
||||
self.input_size // self.group_size))
|
||||
@@ -109,40 +108,80 @@ class TestAscendW4A8DynamicFusedMoEMethod(TestBase):
|
||||
self.assertEqual(
|
||||
param_dict["w2_scale_bias"].shape,
|
||||
(self.experts, self.output_size, 16 // self.quant_method.tp_size))
|
||||
# per-channel weight
|
||||
self.quant_method.is_per_channel_weight = True
|
||||
param_dict = self.quant_method.get_dynamic_quant_param(
|
||||
self.experts, self.input_size, self.output_size, torch.bfloat16)
|
||||
pergroup_param = [
|
||||
"w13_weight_scale_second", "w13_weight_offset_second",
|
||||
"w2_weight_scale_second", "w2_weight_offset_second"
|
||||
]
|
||||
is_contains = any(key in param_dict for key in pergroup_param)
|
||||
self.assertFalse(is_contains)
|
||||
|
||||
# Test helper: builds and returns a torch.nn.Module carrying int8 weight
# Parameters and float32 scale Parameters. Every Parameter is created with
# requires_grad=False. Sizes come from self.experts, self.input_size,
# self.output_size, self.group_size and self.quant_method.tp_size.
#
# is_new_quant_version selects which weight layout is created (see the two
# branches below); is_per_channel_weight=False additionally creates the
# "*_second" scale/offset Parameters.
#
# NOTE(review): leading indentation is not visible in this view, so which
# statements after `else:` belong to that branch (in particular the
# w13_weight_scale / w2_weight_scale assignments) cannot be determined
# here -- confirm against the real file.
def build_layer(self,
|
||||
is_new_quant_version=True,
|
||||
is_per_channel_weight=False):
|
||||
layer = torch.nn.Module()
|
||||
# New quant version: w13_weight is (experts, input_size, output_size),
# w2_weight is (experts, output_size // 2, input_size), and float32
# w13_scale_bias / w2_scale_bias Parameters are attached.
if is_new_quant_version:
|
||||
layer.w13_weight = torch.nn.Parameter(torch.zeros(
|
||||
(self.experts, self.input_size, self.output_size),
|
||||
dtype=torch.int8),
|
||||
requires_grad=False)
|
||||
layer.w2_weight = torch.nn.Parameter(torch.zeros(
|
||||
(self.experts, self.output_size // 2, self.input_size),
|
||||
dtype=torch.int8),
|
||||
requires_grad=False)
|
||||
w13_scale_bias = torch.zeros(
|
||||
(self.experts, 2 * self.input_size, 1), dtype=torch.float32)
|
||||
layer.w13_scale_bias = torch.nn.Parameter(w13_scale_bias,
|
||||
requires_grad=False)
|
||||
w2_scale_bias = torch.zeros((self.experts, self.output_size,
|
||||
16 // self.quant_method.tp_size),
|
||||
dtype=torch.float32)
|
||||
layer.w2_scale_bias = torch.nn.Parameter(w2_scale_bias,
|
||||
requires_grad=False)
|
||||
# Old quant version: w13_weight is (experts, 2 * input_size, output_size)
# and w2_weight is (experts, output_size, input_size); no scale_bias here.
else:
|
||||
layer.w13_weight = torch.nn.Parameter(torch.zeros(
|
||||
(self.experts, 2 * self.input_size, self.output_size),
|
||||
dtype=torch.int8),
|
||||
requires_grad=False)
|
||||
layer.w2_weight = torch.nn.Parameter(torch.zeros(
|
||||
(self.experts, self.output_size, self.input_size),
|
||||
dtype=torch.int8),
|
||||
requires_grad=False)
|
||||
# Per-channel scales, filled with ones (float32).
layer.w13_weight_scale = torch.nn.Parameter(torch.ones(
|
||||
(self.experts, 2 * self.input_size, 1), dtype=torch.float32),
|
||||
requires_grad=False)
|
||||
layer.w2_weight_scale = torch.nn.Parameter(torch.ones(
|
||||
(self.experts, self.output_size, 1), dtype=torch.float32),
|
||||
requires_grad=False)
|
||||
# Per-group ("second") scales and offsets. The offsets are allocated with
# torch.empty_like, so their values are uninitialized; only their shape
# and dtype (copied from the matching scale tensor) are meaningful.
if not is_per_channel_weight:
|
||||
layer.w13_weight_scale_second = torch.nn.Parameter(
|
||||
torch.ones((self.experts, 2 * self.input_size,
|
||||
self.output_size // self.group_size),
|
||||
dtype=torch.float32),
|
||||
requires_grad=False)
|
||||
layer.w13_weight_offset_second = torch.nn.Parameter(
|
||||
torch.empty_like(layer.w13_weight_scale_second.data),
|
||||
requires_grad=False)
|
||||
layer.w2_weight_scale_second = torch.nn.Parameter(
|
||||
torch.ones((self.experts, self.output_size,
|
||||
self.input_size // self.group_size),
|
||||
dtype=torch.float32),
|
||||
requires_grad=False)
|
||||
layer.w2_weight_offset_second = torch.nn.Parameter(
|
||||
torch.empty_like(layer.w2_weight_scale_second.data),
|
||||
requires_grad=False)
|
||||
return layer
|
||||
|
||||
@patch('torch_npu.npu_quantize')
|
||||
@patch('torch.Tensor.npu')
|
||||
def test_process_weights_after_loading(self, mock_npu, mock_npu_quantize):
|
||||
# old quant version weight
|
||||
layer = torch.nn.Module()
|
||||
layer.w13_weight = torch.nn.Parameter(torch.zeros(
|
||||
(self.experts, 2 * self.input_size, self.output_size),
|
||||
dtype=torch.int8),
|
||||
requires_grad=False)
|
||||
layer.w2_weight = torch.nn.Parameter(torch.zeros(
|
||||
(self.experts, self.output_size, self.input_size),
|
||||
dtype=torch.int8),
|
||||
requires_grad=False)
|
||||
layer.w13_weight_scale = torch.nn.Parameter(torch.ones(
|
||||
(self.experts, 2 * self.input_size, 1), dtype=torch.bfloat16),
|
||||
requires_grad=False)
|
||||
layer.w13_weight_scale_second = torch.nn.Parameter(torch.ones(
|
||||
(self.experts, 2 * self.input_size,
|
||||
self.output_size // self.group_size),
|
||||
dtype=torch.bfloat16),
|
||||
requires_grad=False)
|
||||
layer.w2_weight_scale = torch.nn.Parameter(torch.ones(
|
||||
(self.experts, self.output_size, 1), dtype=torch.bfloat16),
|
||||
requires_grad=False)
|
||||
layer.w2_weight_scale_second = torch.nn.Parameter(torch.ones(
|
||||
(self.experts, self.output_size,
|
||||
self.input_size // self.group_size),
|
||||
dtype=torch.bfloat16),
|
||||
requires_grad=False)
|
||||
new_layer = copy.deepcopy(layer)
|
||||
|
||||
mock_npu.return_value = torch.Tensor()
|
||||
mock_npu_quantize.return_value = torch.Tensor()
|
||||
# old quant version weight
|
||||
layer = self.build_layer(is_new_quant_version=False)
|
||||
self.quant_method.process_weights_after_loading(layer)
|
||||
self.assertTrue(hasattr(layer, "w13_scale_bias"))
|
||||
self.assertEqual(layer.w13_scale_bias.data.shape,
|
||||
@@ -154,23 +193,17 @@ class TestAscendW4A8DynamicFusedMoEMethod(TestBase):
|
||||
self.assertEqual(layer.w2_scale_bias.data.dtype, torch.float32)
|
||||
# new quant version weight
|
||||
self.quant_method.new_quant_version = True
|
||||
new_layer.w13_weight.data = torch.zeros(
|
||||
(self.experts, self.input_size, self.output_size),
|
||||
dtype=torch.int8)
|
||||
new_layer.w2_weight.data = torch.zeros(
|
||||
(self.experts, self.output_size // 2, self.input_size),
|
||||
dtype=torch.int8)
|
||||
w13_scale_bias = torch.zeros((self.experts, 2 * self.input_size, 1),
|
||||
dtype=torch.float32)
|
||||
new_layer.w13_scale_bias = torch.nn.Parameter(w13_scale_bias,
|
||||
requires_grad=False)
|
||||
w2_scale_bias = torch.zeros(
|
||||
(self.experts, self.output_size, 16 // self.quant_method.tp_size),
|
||||
dtype=torch.float32)
|
||||
new_layer.w2_scale_bias = torch.nn.Parameter(w2_scale_bias,
|
||||
requires_grad=False)
|
||||
new_layer = self.build_layer(is_new_quant_version=True)
|
||||
self.quant_method.process_weights_after_loading(new_layer)
|
||||
self.assertEqual(new_layer.w13_scale_bias.data.shape,
|
||||
(self.experts, 2 * self.input_size))
|
||||
self.assertEqual(new_layer.w2_scale_bias.data.shape,
|
||||
(self.experts, self.output_size))
|
||||
self.assertFalse(hasattr(new_layer, "w13_weight_scale_second"))
|
||||
# per-channel weight
|
||||
self.quant_method.is_per_channel_weight = True
|
||||
per_channel_layer = self.build_layer(is_new_quant_version=True,
|
||||
is_per_channel_weight=True)
|
||||
self.quant_method.process_weights_after_loading(per_channel_layer)
|
||||
self.assertEqual(new_layer.w13_scale_bias.data.shape,
|
||||
(self.experts, 2 * self.input_size))
|
||||
|
||||
95
tests/ut/torchair/test_torchair_attention.py
Normal file
95
tests/ut/torchair/test_torchair_attention.py
Normal file
@@ -0,0 +1,95 @@
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import torch
|
||||
from vllm.attention.backends.abstract import AttentionType
|
||||
from vllm.distributed.parallel_state import GroupCoordinator
|
||||
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
from vllm_ascend.torchair.torchair_attention import \
|
||||
AscendAttentionTorchairBackendImpl
|
||||
|
||||
|
||||
# Unit test for AscendAttentionTorchairBackendImpl: setUp constructs the impl
# under a stack of patches, and the single test calls forward() in the
# DecodeOnly attention state with the torch_npu kernels mocked out.
class TestAscendAttentionTorchairBackendImpl(TestBase):
|
||||
|
||||
# Stacked @patch decorators inject their mocks bottom-up, which is why the
# parameters read: ascend_config, vllm_config, mock_get_tp_size, mock_tp,
# mock_zeros.
# NOTE(review): the last patch targets
# vllm_ascend.torchair.torchair_mla.get_ascend_config, while the class under
# test is imported from vllm_ascend.torchair.torchair_attention -- confirm
# this is the intended module.
@patch("torch.zeros")
|
||||
@patch('vllm.distributed.parallel_state._TP',
|
||||
new_callable=lambda: MagicMock(spec=GroupCoordinator)) # TODO
|
||||
@patch("vllm.distributed.get_tensor_model_parallel_world_size",
|
||||
return_value=2) # TODO
|
||||
@patch("vllm.config.get_current_vllm_config") # TODO
|
||||
@patch("vllm_ascend.torchair.torchair_mla.get_ascend_config") # TODO
|
||||
def setUp(self, ascend_config, vllm_config, mock_get_tp_size, mock_tp,
|
||||
mock_zeros):
|
||||
mock_tp.world_size = 2 # TODO
|
||||
# NOTE(review): ascend_config and vllm_config are the mocks that replace
# the patched *functions*; the attributes below are set on those mocks
# themselves, not on their .return_value -- confirm this is intended.
ascend_config.torchair_graph_config.enabled = True # TODO
|
||||
ascend_config.torchair_graph_config.enable_kv_nz = False # TODO
|
||||
speculative_config = MagicMock()
|
||||
speculative_config.num_speculative_tokens = 4
|
||||
vllm_config.speculative_config = speculative_config
|
||||
|
||||
num_heads = 32
|
||||
head_size = 128 # TODO
|
||||
scale = 0.1 # TODO
|
||||
num_kv_heads = 4
|
||||
kv_cache_dtype = "auto"
|
||||
attn_type = AttentionType.DECODER
|
||||
# While patched, every torch.zeros(...) call returns this 0-dim int32
# tensor holding 1, regardless of the requested shape.
mock_zeros.return_value = torch.ones((),
|
||||
device='cpu',
|
||||
dtype=torch.int32)
|
||||
|
||||
self.impl = AscendAttentionTorchairBackendImpl(
|
||||
num_heads=num_heads,
|
||||
head_size=head_size,
|
||||
scale=scale,
|
||||
num_kv_heads=num_kv_heads,
|
||||
alibi_slopes=None,
|
||||
sliding_window=None,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
blocksparse_params=None,
|
||||
logits_soft_cap=None,
|
||||
attn_type=attn_type,
|
||||
kv_sharing_target_layer_name=None)
|
||||
|
||||
# Decorators apply bottom-up: mock_fused replaces
# torch_npu.npu_fused_infer_attention_score, and the ignored `_` parameter
# receives the mock for torch_npu.npu_scatter_nd_update_.
@patch("torch_npu.npu_scatter_nd_update_")
|
||||
@patch("torch_npu.npu_fused_infer_attention_score")
|
||||
def test_forward_with_decode_only(self, mock_fused, _):
|
||||
layer = MagicMock()
|
||||
layer._k_scale_float = 1.0
|
||||
layer._v_scale_float = 1.0
|
||||
|
||||
seq_len = 1
|
||||
num_tokens = 100
|
||||
num_blocks = 256
|
||||
block_size = 4
|
||||
|
||||
# query uses num_heads; key and value use num_kv_heads for the last dim.
query = torch.randn(num_tokens, seq_len,
|
||||
self.impl.num_heads * self.impl.head_size)
|
||||
key = torch.randn(num_tokens, seq_len,
|
||||
self.impl.num_kv_heads * self.impl.head_size)
|
||||
value = torch.randn(num_tokens, seq_len,
|
||||
self.impl.num_kv_heads * self.impl.head_size)
|
||||
# NOTE(review): both cache tensors are sized with num_heads, whereas key
# and value above use num_kv_heads -- confirm which is intended.
kv_cache = (torch.randn(num_blocks, block_size,
|
||||
self.impl.num_heads * self.impl.head_size),
|
||||
torch.randn(num_blocks, block_size,
|
||||
self.impl.num_heads * self.impl.head_size))
|
||||
output = torch.randn(num_tokens, self.impl.num_heads,
|
||||
self.impl.head_size)
|
||||
|
||||
decode = MagicMock() # TODO
|
||||
decode.seq_lens_list = [2] * num_tokens
|
||||
decode.block_table = torch.ones(num_tokens, 8, dtype=torch.int32)
|
||||
decode.attn_mask = None
|
||||
|
||||
metadata = MagicMock()
|
||||
metadata.attn_state = AscendAttentionState.DecodeOnly
|
||||
metadata.slot_mapping = torch.arange(num_tokens, dtype=torch.int32)
|
||||
metadata.decode = decode
|
||||
|
||||
# The mocked fused kernel returns a 2-tuple: a ones tensor of shape
# (num_tokens, num_heads, head_size) and a one-element ones tensor.
mock_fused.return_value = (torch.ones(num_tokens, self.impl.num_heads,
|
||||
self.impl.head_size),
|
||||
torch.ones(1))
|
||||
|
||||
# The meaning of the trailing positional False is not visible here.
result = self.impl.forward(layer, query, key, value, kv_cache,
|
||||
metadata, output, False)
|
||||
# Only the leading dimension of the result is checked.
self.assertEqual(result.shape[0], num_tokens)
|
||||
@@ -190,12 +190,15 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
ascend_config = MagicMock()
|
||||
ascend_config.torchair_graph_config = MagicMock()
|
||||
ascend_config.torchair_graph_config.enabled = True
|
||||
with patch("vllm_ascend.torchair.torchair_mla.get_ascend_config",
|
||||
return_value=ascend_config):
|
||||
builder = AscendMLATorchairMetadataBuilder(mock_vllm_config,
|
||||
builder = AscendMLATorchairMetadataBuilder(None, None,
|
||||
mock_vllm_config,
|
||||
mock_device)
|
||||
|
||||
self.assertEqual(builder.block_size,
|
||||
@@ -216,7 +219,10 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
|
||||
ascend_config.torchair_graph_config = MagicMock()
|
||||
ascend_config.torchair_graph_config.enabled = True
|
||||
|
||||
builder = AscendMLATorchairMetadataBuilder(mock_vllm_config,
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
builder = AscendMLATorchairMetadataBuilder(None, None,
|
||||
mock_vllm_config,
|
||||
mock_device)
|
||||
|
||||
input_batch = MagicMock()
|
||||
@@ -250,9 +256,12 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
with patch("vllm_ascend.torchair.torchair_mla.get_ascend_config",
|
||||
return_value=ascend_config):
|
||||
builder = AscendMLATorchairMetadataBuilder(mock_vllm_config,
|
||||
builder = AscendMLATorchairMetadataBuilder(None, None,
|
||||
mock_vllm_config,
|
||||
mock_device)
|
||||
|
||||
input_batch = MagicMock()
|
||||
@@ -285,7 +294,10 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
|
||||
builder = AscendMLATorchairMetadataBuilder(mock_vllm_config,
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
builder = AscendMLATorchairMetadataBuilder(None, None,
|
||||
mock_vllm_config,
|
||||
mock_device)
|
||||
block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32)
|
||||
|
||||
@@ -305,7 +317,10 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
|
||||
builder = AscendMLATorchairMetadataBuilder(mock_vllm_config,
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
builder = AscendMLATorchairMetadataBuilder(None, None,
|
||||
mock_vllm_config,
|
||||
mock_device)
|
||||
block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32)
|
||||
|
||||
@@ -326,7 +341,10 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
|
||||
builder = AscendMLATorchairMetadataBuilder(mock_vllm_config,
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
builder = AscendMLATorchairMetadataBuilder(None, None,
|
||||
mock_vllm_config,
|
||||
mock_device)
|
||||
|
||||
block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32)
|
||||
@@ -351,7 +369,11 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
|
||||
mock_vllm_config.model_config.dtype = torch.float16
|
||||
mock_device = 'cpu'
|
||||
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
builder = AscendMLATorchairMetadataBuilder(
|
||||
None,
|
||||
None,
|
||||
mock_vllm_config,
|
||||
mock_device,
|
||||
metadata_cls=AscendMLATorchairMetadata)
|
||||
@@ -416,7 +438,11 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
|
||||
model = MagicMock(spec=nn.Module)
|
||||
model.model = MagicMock(spec=nn.Module)
|
||||
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
builder = AscendMLATorchairMetadataBuilder(
|
||||
None,
|
||||
None,
|
||||
mock_vllm_config,
|
||||
mock_device,
|
||||
metadata_cls=AscendMLATorchairMetadata)
|
||||
@@ -437,14 +463,16 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
|
||||
max_query_len=1,
|
||||
decode_token_per_req=torch.tensor([1, 1, 1]),
|
||||
block_table_tensor=torch.zeros((10, 10)),
|
||||
slot_mapping_cpu=torch.tensor(range(20)),
|
||||
slot_mapping=torch.tensor(range(20)),
|
||||
actual_seq_lengths_q=torch.tensor([0, 1, 2]),
|
||||
positions=torch.tensor([1, 1]),
|
||||
attn_mask=torch.ones((15, 15)),
|
||||
spec_attn_mask=None,
|
||||
attn_state=AscendAttentionState.ChunkedPrefill)
|
||||
attn_state=AscendAttentionState.ChunkedPrefill,
|
||||
num_computed_tokens_cpu=None,
|
||||
seq_lens=None)
|
||||
|
||||
metadata = builder.build(common_attn_metadata, model)
|
||||
metadata = builder.build(1, common_attn_metadata, model)
|
||||
|
||||
self.assertIsInstance(metadata, AscendMLATorchairMetadata)
|
||||
self.assertEqual(metadata.num_input_tokens, 0)
|
||||
|
||||
@@ -6,7 +6,6 @@ from unittest.mock import MagicMock, patch
|
||||
import torch
|
||||
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend.quantization.quantizer import SUPPORT_ASCEND_QUANTIZER_TYPE
|
||||
from vllm_ascend.torchair import utils
|
||||
|
||||
|
||||
@@ -135,15 +134,3 @@ class TestTorchairUtils(TestBase):
|
||||
|
||||
utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
|
||||
mock_npu_cast.assert_not_called()
|
||||
|
||||
def test_torchair_quant_method_register(self):
|
||||
|
||||
TorchairW8A8DYNAMICQuantizer = SUPPORT_ASCEND_QUANTIZER_TYPE[
|
||||
"W8A8_DYNAMIC"]
|
||||
TorchairW4A8DYNAMICQuantizer = SUPPORT_ASCEND_QUANTIZER_TYPE[
|
||||
"W4A8_DYNAMIC"]
|
||||
utils.torchair_quant_method_register()
|
||||
self.assertNotEqual(TorchairW8A8DYNAMICQuantizer,
|
||||
SUPPORT_ASCEND_QUANTIZER_TYPE["W8A8_DYNAMIC"])
|
||||
self.assertNotEqual(TorchairW4A8DYNAMICQuantizer,
|
||||
SUPPORT_ASCEND_QUANTIZER_TYPE["W4A8_DYNAMIC"])
|
||||
|
||||
Reference in New Issue
Block a user