init v0.11.0rc0

This commit is contained in:
2025-10-14 10:38:28 +08:00
parent 67afd0ea78
commit 66dc16f966
278 changed files with 28130 additions and 11708 deletions

View File

@@ -24,10 +24,10 @@ from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase
from vllm_ascend.ascend_forward_context import _get_fused_moe_state
from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod
from vllm_ascend.quantization.quantizer import W8A8Quantizer
from vllm_ascend.torchair.ops.torchair_fused_moe import (
TorchairAscendFusedMoE, TorchairAscendUnquantizedFusedMoEMethod)
from vllm_ascend.utils import AscendSocVersion, adapt_patch # noqa E402
from vllm_ascend.utils import adapt_patch # noqa E402
from vllm_ascend.utils import AscendSocVersion, vllm_version_is
adapt_patch(True)
@@ -54,6 +54,10 @@ def mock_dp_and_tp_group(mocker):
@pytest.fixture
def mock_dist_env(mocker: MockerFixture):
# init dist env patch
if vllm_version_is("0.10.2"):
dp_metadata = MagicMock(cu_tokens_across_dp_cpu=[5, 10])
else:
dp_metadata = MagicMock(num_tokens_across_dp_cpu=[5, 5])
with patch('torch.distributed.get_rank', return_value=0), \
patch('torch.distributed.get_world_size', return_value=4), \
@@ -67,13 +71,13 @@ def mock_dist_env(mocker: MockerFixture):
patch('torch.distributed.all_to_all_single', return_value=torch.randn(8, 32)), \
patch('vllm_ascend.torchair.ops.torchair_fused_moe.tensor_model_parallel_all_reduce',
return_value=torch.randn(5, 32)), \
patch('vllm_ascend.torchair.ops.torchair_fused_moe.data_parallel_reduce_scatter',
return_value=torch.randn(5, 32)), \
patch('vllm.model_executor.layers.fused_moe.config.get_dp_group',
return_value=mock_dp_and_tp_group(mocker)), \
patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_config',
return_value=MagicMock(
torchair_graph_config=MagicMock(enabled=False, enable_multistream_moe=False),
torchair_graph_config=MagicMock(enabled=False),
enable_multistream_moe=False,
enable_shared_expert_dp=False,
expert_map_path=None
)), \
patch('vllm_ascend.torchair.ops.torchair_fused_moe.determine_expert_map',
@@ -81,7 +85,7 @@ def mock_dist_env(mocker: MockerFixture):
patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context',
return_value=MagicMock(
max_tokens_across_dp=10,
dp_metadata=MagicMock(cu_tokens_across_dp_cpu=[5, 10])
dp_metadata=dp_metadata,
)), \
patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_current_vllm_config',
return_value=MagicMock(
@@ -154,6 +158,8 @@ def default_moe_config():
def moe_method(mock_dist_env):
    """Return a TorchairAscendUnquantizedFusedMoEMethod built on a mocked MoE config.

    ``mock_dist_env`` is accepted only so the distributed-environment patches
    are active while the method object is constructed.
    """
    moe_config = MagicMock()
    parallel_cfg = moe_config.moe_parallel_config
    # NOTE(review): ep_size=4 is attached to the *call result* of
    # moe_parallel_config, while use_ep/dp_size below are plain attributes —
    # confirm this asymmetry is intended.
    parallel_cfg.return_value = MagicMock(ep_size=4)
    parallel_cfg.use_ep = False
    parallel_cfg.dp_size = 1
    return TorchairAscendUnquantizedFusedMoEMethod(moe_config)
@@ -199,6 +205,9 @@ class MockFusedMoEMethod(FusedMoEMethodBase):
expert_weights: torch.Tensor) -> torch.Tensor:
pass
def get_fused_moe_quant_config(self, layer: torch.nn.Module):
    """Stub implementation: ignores ``layer`` and returns ``None``.

    Presumably present so the mock class provides this hook of its base
    class — confirm against FusedMoEMethodBase.
    """
    return None
class TestTorchairAscendFusedMoe:
@@ -236,12 +245,9 @@ class TestTorchairAscendFusedMoe:
mock_quant_method = MockFusedMoEMethod()
mock_quant_config.get_quant_method.return_value = mock_quant_method
mock_quant_config.is_layer_skipped_ascend.return_value = False
with patch(
'vllm_ascend.quantization.quantizer.AscendQuantizer.get_quantizer',
return_value=W8A8Quantizer):
with patch("vllm_ascend.quantization.quant_config.get_quant_method"):
moe = TorchairAscendFusedMoE(**default_moe_config,
quant_config=mock_quant_config)
assert moe.quant_method is not None
assert isinstance(moe.quant_method, AscendFusedMoEMethod)

View File

@@ -5,8 +5,9 @@ import torch
from tests.ut.base import TestBase
from vllm_ascend.torchair.ops.torchair_rotary_embedding import (
custom_rotary_embedding_enabled, native_rope_deepseek_forward,
rope_forward_oot, rotate_half, yarn_find_correction_dim, yarn_get_mscale)
_set_cos_sin_cache, custom_rotary_embedding_enabled,
native_rope_deepseek_forward, rope_forward_oot, rotate_half,
yarn_find_correction_dim, yarn_get_mscale)
class TestCustomRotaryEmbeddingEnabled(TestBase):
@@ -103,7 +104,7 @@ class TestRopeForwardOot(TestBase):
self.assertTrue(torch.equal(result_q, self.query))
self.assertTrue(torch.equal(result_k, self.key))
@patch('torch.ops._C')
@patch('torch.ops._C_ascend')
@patch(
'vllm_ascend.torchair.ops.torchair_rotary_embedding.get_ascend_config')
@patch('vllm_ascend.torchair.ops.torchair_rotary_embedding.is_310p',
@@ -200,6 +201,28 @@ class MockRopeModule:
self.sin_cached = None
self.rotary_dim = 1
self.base = 1
self.beta_fast = 32
self.beta_slow = 1
self.max_position_embeddings = 4096
self.mscale = 1.0
self.scaling_factor = 40
def register_buffer(self, *args, **kwargs):
    """No-op stand-in for a module's ``register_buffer``.

    Accepts and discards any positional/keyword arguments so the stub can be
    called the same way as ``torch.nn.Module.register_buffer(name, tensor,
    persistent=...)`` even when it has not been replaced via ``patch.object``.
    Previously the stub took no arguments and such a call raised TypeError.

    Returns:
        None. Nothing is stored on the instance.
    """
    return None
class TestSetSinCosCache(TestBase):
    """Checks that ``_set_cos_sin_cache`` calls ``register_buffer`` on the module."""

    def test_set_cos_sin_cache(self):
        """Run the cache builder on a mock rope module and spy on register_buffer."""
        rope = MockRopeModule()
        cache_kwargs = {"device": "cpu", "dtype": torch.bfloat16}

        with patch.object(rope, "register_buffer") as register_spy:
            _set_cos_sin_cache(rope, 1024, **cache_kwargs)

        # The spy keeps its call record after the patch is undone.
        register_spy.assert_called()
class TestNativeRopeDeepseekForward(TestBase):
@@ -220,30 +243,6 @@ class TestNativeRopeDeepseekForward(TestBase):
assert q_pe.shape == query.shape
assert k_pe.shape == key.shape
# Calls native_rope_deepseek_forward with max_seq_len=2048 on a mock module
# constructed with max_seq_len=1024, and checks that the returned tensors keep
# the shapes of the inputs.
# NOTE(review): the enclosing hunk header (-220,30 +243,6) indicates these
# lines are removed by this commit.
#
# Both _set_cos_sin_cache and rope_forward_oot are replaced by mocks, so
# neither real implementation runs. Stacked @patch decorators inject mocks
# bottom-up: the decorator closest to the def supplies the first mock argument.
@patch(
'vllm_ascend.torchair.ops.torchair_rotary_embedding._set_cos_sin_cache'
)
@patch(
'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot')
def test_native_rope_deepseek_forward_cache_handling(
self, mock_rope_forward_oot, mock_set_cache):
# Test the case where the requested max_seq_len (2048) exceeds the
# max_seq_len the module was built with (1024).
# NOTE(review): mock_set_cache is never asserted on, so this test does not
# verify that the cache is actually rebuilt — confirm intent.
module = MockRopeModule(max_seq_len=1024)
positions = torch.tensor([1, 2, 3])
query = torch.randn(1, 8, 128)
key = torch.randn(1, 8, 128)
# The mocked rope_forward_oot hands back the inputs as a (query, key) pair.
mock_rope_forward_oot.return_value = (query, key)
q_pe, k_pe = native_rope_deepseek_forward(module,
positions,
query,
key,
max_seq_len=2048)
# Only shapes are checked, not values.
assert q_pe.shape == query.shape
assert k_pe.shape == key.shape
@patch(
'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot')
def test_native_rope_deepseek_forward_key_reshaping(