[main] rename device type (#5099)
### What this PR does / why we need it?
Rename `_910B` to `A2`;
Rename `_910_93` to `A3`;
Rename `_910_95` to `A5`;
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: zzzzwwjj <1183291235@qq.com>
This commit is contained in:
31
setup.py
31
setup.py
@@ -135,23 +135,22 @@ else:
|
||||
def gen_build_info():
|
||||
soc_version = envs.SOC_VERSION
|
||||
|
||||
# TODO(zzzzwwjj): Add A5 case
|
||||
soc_to_device = {
|
||||
"910b": "_910B",
|
||||
"910c": "_910_93",
|
||||
"910b": "A2",
|
||||
"910c": "A3",
|
||||
"310p": "_310P",
|
||||
"ascend910b1": "_910B",
|
||||
"ascend910b2": "_910B",
|
||||
"ascend910b2c": "_910B",
|
||||
"ascend910b3": "_910B",
|
||||
"ascend910b4": "_910B",
|
||||
"ascend910b4-1": "_910B",
|
||||
"ascend910_9391": "_910_93",
|
||||
"ascend910_9381": "_910_93",
|
||||
"ascend910_9372": "_910_93",
|
||||
"ascend910_9392": "_910_93",
|
||||
"ascend910_9382": "_910_93",
|
||||
"ascend910_9362": "_910_93",
|
||||
"ascend910b1": "A2",
|
||||
"ascend910b2": "A2",
|
||||
"ascend910b2c": "A2",
|
||||
"ascend910b3": "A2",
|
||||
"ascend910b4": "A2",
|
||||
"ascend910b4-1": "A2",
|
||||
"ascend910_9391": "A3",
|
||||
"ascend910_9381": "A3",
|
||||
"ascend910_9372": "A3",
|
||||
"ascend910_9392": "A3",
|
||||
"ascend910_9382": "A3",
|
||||
"ascend910_9362": "A3",
|
||||
"ascend310p1": "_310P",
|
||||
"ascend310p3": "_310P",
|
||||
"ascend310p5": "_310P",
|
||||
@@ -160,7 +159,7 @@ def gen_build_info():
|
||||
"ascend310p3vir02": "_310P",
|
||||
"ascend310p3vir04": "_310P",
|
||||
"ascend310p3vir08": "_310P",
|
||||
"ascend910_9579": "_910_95",
|
||||
"ascend910_9579": "A5",
|
||||
}
|
||||
|
||||
assert soc_version in soc_to_device, f"Undefined soc_version: {soc_version}. Please file an issue to vllm-ascend."
|
||||
|
||||
@@ -215,7 +215,7 @@ def test_aclgraph_capture_replay_metrics_dp2(
|
||||
# Part A: Warmup runs (Profile run + 2 runs per captured graph)
|
||||
warmup_runs = 1 + (2 * max_batch_sizes)
|
||||
soc_version = get_ascend_device_type()
|
||||
if soc_version in {AscendDeviceType._910_93} and "DeepSeek" in model:
|
||||
if soc_version in {AscendDeviceType.A3} and "DeepSeek" in model:
|
||||
# An extra warmup run is needed for MC2 warmup here
|
||||
warmup_runs += 1
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ class TestAscendAttentionBackend(TestBase):
|
||||
AscendAttentionMetadataBuilder)
|
||||
|
||||
@patch('vllm_ascend.utils.get_ascend_device_type',
|
||||
return_value=AscendDeviceType._910_93)
|
||||
return_value=AscendDeviceType.A3)
|
||||
def test_get_kv_cache_shape_not_310p(self, mock_soc_version):
|
||||
result = AscendAttentionBackend.get_kv_cache_shape(10, 20, 30, 40)
|
||||
self.assertEqual(result, (2, 10, 20, 30, 40))
|
||||
@@ -103,7 +103,7 @@ class TestAscendAttentionMetadataBuilder(TestBase):
|
||||
|
||||
@patch('vllm_ascend.attention.attention_v1.AscendMetadata')
|
||||
@patch('vllm_ascend.utils.get_ascend_device_type',
|
||||
return_value=AscendDeviceType._910_93)
|
||||
return_value=AscendDeviceType.A3)
|
||||
def test_build_non_310p(self, mock_soc_version, mock_ascend_metadata):
|
||||
common_attn_metadata = AscendCommonAttentionMetadata(
|
||||
query_start_loc=torch.tensor([0, 2, 5, 9]),
|
||||
|
||||
@@ -49,7 +49,7 @@ def test_SiluAndMul_forward(mock_maybe_prefetch_mlp_down_proj,
|
||||
|
||||
with patch("vllm_ascend.utils.get_ascend_device_type",
|
||||
return_value=AscendDeviceType._310P
|
||||
if is_310p else AscendDeviceType._910_93):
|
||||
if is_310p else AscendDeviceType.A3):
|
||||
layer = SiluAndMul()
|
||||
out = layer.forward(dummy_tensor)
|
||||
|
||||
|
||||
@@ -127,7 +127,7 @@ def mock_dist_env(mocker: MockerFixture):
|
||||
return_value=mock_forward_context_obj), \
|
||||
patch('vllm_ascend.ops.fused_moe.prepare_finalize.get_forward_context',
|
||||
return_value=mock_forward_context_obj), \
|
||||
patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType._910_93), \
|
||||
patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3), \
|
||||
patch('vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context',
|
||||
return_value=mock_forward_context_obj), \
|
||||
patch('vllm_ascend.ops.fused_moe.moe_comm_method.MC2CommImpl._get_token_dispatcher',
|
||||
@@ -323,7 +323,7 @@ class TestUnifiedApplyMLP(TestBase):
|
||||
|
||||
@patch('vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context')
|
||||
@patch('vllm_ascend.utils.get_ascend_device_type',
|
||||
return_value=AscendDeviceType._910_93)
|
||||
return_value=AscendDeviceType.A3)
|
||||
@patch('torch_npu.npu_grouped_matmul')
|
||||
@patch('torch_npu.npu_dynamic_quant')
|
||||
@patch('torch_npu.npu_dequant_swiglu_quant')
|
||||
@@ -386,7 +386,7 @@ class TestUnifiedApplyMLP(TestBase):
|
||||
self.assertEqual(result.dtype, torch.bfloat16)
|
||||
|
||||
@patch('vllm_ascend.utils.get_ascend_device_type',
|
||||
return_value=AscendDeviceType._910_93)
|
||||
return_value=AscendDeviceType.A3)
|
||||
@patch('torch_npu.npu_grouped_matmul')
|
||||
@patch('torch_npu.npu_swiglu')
|
||||
@patch('torch_npu.npu_dynamic_quant')
|
||||
|
||||
@@ -30,7 +30,7 @@ def test_RMSNorm_forward(mock_add_rmsnorm, mock_rmsnorm, is_310p, residual,
|
||||
|
||||
with patch("vllm_ascend.utils.get_ascend_device_type",
|
||||
return_value=AscendDeviceType._310P
|
||||
if is_310p else AscendDeviceType._910_93):
|
||||
if is_310p else AscendDeviceType.A3):
|
||||
layer = RMSNorm(hidden_size=8, eps=1e-05)
|
||||
if residual is not None:
|
||||
out_x, out_residual = layer.forward_oot(dummy_tensor, residual)
|
||||
|
||||
@@ -99,7 +99,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
|
||||
|
||||
@patch('torch.ops._C_ascend')
|
||||
@patch('vllm_ascend.utils.get_ascend_device_type',
|
||||
return_value=AscendDeviceType._910_93)
|
||||
return_value=AscendDeviceType.A3)
|
||||
@patch('vllm_ascend.ops.rotary_embedding._custom_rotary_embedding_enabled',
|
||||
return_value=True)
|
||||
@patch('torch.ops._npu_rotary_embedding')
|
||||
|
||||
@@ -53,7 +53,7 @@ class TestTokenDispatcherWithMC2(TestBase):
|
||||
# Mock get_ascend_device_type()
|
||||
self.ascend_soc_version_patch = patch(
|
||||
"vllm_ascend.ops.fused_moe.token_dispatcher.get_ascend_device_type",
|
||||
return_value=AscendDeviceType._910_93)
|
||||
return_value=AscendDeviceType.A3)
|
||||
self.ascend_soc_version_patch.start()
|
||||
|
||||
kwargs = {"with_quant": False, "top_k": 8, "num_experts": 128}
|
||||
|
||||
@@ -347,7 +347,7 @@ class TestAscendC8KVCacheMethod(TestBase):
|
||||
self.assertEqual(param.shape, expected_shape)
|
||||
|
||||
@patch('vllm_ascend.utils.get_ascend_device_type',
|
||||
return_value=AscendDeviceType._910_93)
|
||||
return_value=AscendDeviceType.A3)
|
||||
def test_process_weights_after_loading_not_310p(self, mock_soc_version):
|
||||
key_data = torch.ones(4 * 64)
|
||||
value_data = torch.ones(4 * 64) * 2
|
||||
|
||||
@@ -231,7 +231,7 @@ class TestNPUPlatform(TestBase):
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch("vllm_ascend.utils.update_aclgraph_sizes")
|
||||
@patch('vllm_ascend.utils.get_ascend_device_type',
|
||||
return_value=AscendDeviceType._910_93)
|
||||
return_value=AscendDeviceType.A3)
|
||||
@patch("os.environ", {})
|
||||
@patch(
|
||||
"vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
|
||||
@@ -263,7 +263,7 @@ class TestNPUPlatform(TestBase):
|
||||
mock_init_ascend.assert_called_once_with(vllm_config)
|
||||
|
||||
@patch('vllm_ascend.utils.get_ascend_device_type',
|
||||
return_value=AscendDeviceType._910_93)
|
||||
return_value=AscendDeviceType.A3)
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch(
|
||||
"vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
|
||||
@@ -288,7 +288,7 @@ class TestNPUPlatform(TestBase):
|
||||
self.assertTrue("Model config is missing" in cm.output[0])
|
||||
|
||||
@patch('vllm_ascend.utils.get_ascend_device_type',
|
||||
return_value=AscendDeviceType._910_93)
|
||||
return_value=AscendDeviceType.A3)
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch(
|
||||
"vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
|
||||
@@ -324,7 +324,7 @@ class TestNPUPlatform(TestBase):
|
||||
)
|
||||
|
||||
@patch('vllm_ascend.utils.get_ascend_device_type',
|
||||
return_value=AscendDeviceType._910_93)
|
||||
return_value=AscendDeviceType.A3)
|
||||
@patch("vllm_ascend.utils.update_default_aclgraph_sizes")
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch(
|
||||
@@ -365,7 +365,7 @@ class TestNPUPlatform(TestBase):
|
||||
@pytest.mark.skip(
|
||||
"Revert me when vllm support setting cudagraph_mode on oot platform")
|
||||
@patch('vllm_ascend.utils.get_ascend_device_type',
|
||||
return_value=AscendDeviceType._910_93)
|
||||
return_value=AscendDeviceType.A3)
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
def test_check_and_update_config_unsupported_cudagraph_mode(
|
||||
self, mock_init_ascend, mock_soc_version):
|
||||
@@ -394,7 +394,7 @@ class TestNPUPlatform(TestBase):
|
||||
)
|
||||
|
||||
@patch('vllm_ascend.utils.get_ascend_device_type',
|
||||
return_value=AscendDeviceType._910_93)
|
||||
return_value=AscendDeviceType.A3)
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch(
|
||||
"vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
|
||||
@@ -421,7 +421,7 @@ class TestNPUPlatform(TestBase):
|
||||
self.assertEqual(vllm_config.cache_config.block_size, 128)
|
||||
|
||||
@patch('vllm_ascend.utils.get_ascend_device_type',
|
||||
return_value=AscendDeviceType._910_93)
|
||||
return_value=AscendDeviceType.A3)
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch(
|
||||
"vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
|
||||
|
||||
@@ -244,7 +244,7 @@ def select_moe_comm_method(num_tokens: int,
|
||||
|
||||
if not vllm_config.parallel_config.enable_expert_parallel:
|
||||
moe_comm_type = MoECommType.ALLGATHER
|
||||
elif soc_version in {AscendDeviceType._910B}:
|
||||
elif soc_version in {AscendDeviceType.A2}:
|
||||
if (num_tokens <= mc2_tokens_capacity
|
||||
and vllm_config.parallel_config.world_size_across_dp /
|
||||
vllm_config.parallel_config.pipeline_parallel_size >= 16):
|
||||
@@ -256,7 +256,7 @@ def select_moe_comm_method(num_tokens: int,
|
||||
else:
|
||||
moe_comm_type = MoECommType.ALLGATHER
|
||||
|
||||
elif soc_version in {AscendDeviceType._910_93}:
|
||||
elif soc_version in {AscendDeviceType.A3}:
|
||||
ascend_config = get_ascend_config()
|
||||
dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
|
||||
# TODO: drop the EP-size guard when dispatch_ffn_combine supports larger EP sizes
|
||||
|
||||
@@ -640,7 +640,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
|
||||
attn_metadata: AscendMetadata,
|
||||
output: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
if get_ascend_device_type() == AscendDeviceType._910_95:
|
||||
if get_ascend_device_type() == AscendDeviceType.A5:
|
||||
return self._forward_decode_only_ascend91095(
|
||||
query, attn_metadata, output)
|
||||
if self.sliding_window is not None and attn_metadata.seq_lens.shape[
|
||||
@@ -729,7 +729,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
|
||||
if self.key_cache is None:
|
||||
self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
|
||||
slots = attn_metadata.slot_mapping
|
||||
if get_ascend_device_type() == AscendDeviceType._910_95:
|
||||
if get_ascend_device_type() == AscendDeviceType.A5:
|
||||
# TODO: Once eagle runs to here, it may have errors because of the 0 dim of slot_mapping.
|
||||
# Should check whether the 0 dim of slot_mapping must equal the 0 dim of key.
|
||||
# If it's necessary, the slots should be sliced.
|
||||
|
||||
@@ -99,7 +99,7 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
|
||||
self.enable_dispatch_v2 = hasattr(torch_npu,
|
||||
"npu_moe_distribute_dispatch_v2")
|
||||
self.need_extra_args = (
|
||||
get_ascend_device_type() == AscendDeviceType._910_93)
|
||||
get_ascend_device_type() == AscendDeviceType.A3)
|
||||
|
||||
# NOTE: When in A2, setting the environment variables HCCL_INTRA_PCIE_ENABLE=1 and
|
||||
# HCCL_INTRA_ROCE_ENABLE=0 can reduce cross-machine communication traffic and significantly
|
||||
|
||||
@@ -499,7 +499,7 @@ class AscendMRotaryEmbedding(MRotaryEmbedding):
|
||||
key: torch.Tensor,
|
||||
):
|
||||
if self.mrope_section != [16, 24, 24] or \
|
||||
get_ascend_device_type() == AscendDeviceType._910_95:
|
||||
get_ascend_device_type() == AscendDeviceType.A5:
|
||||
return super().forward_oot(positions, query, key)
|
||||
|
||||
import torch_npu
|
||||
|
||||
@@ -674,10 +674,10 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None):
|
||||
|
||||
|
||||
class AscendDeviceType(Enum):
|
||||
_910B = 0 # A2
|
||||
_910_93 = 1 # A3
|
||||
A2 = 0
|
||||
A3 = 1
|
||||
_310P = 2
|
||||
_910_95 = 3 # A5
|
||||
A5 = 3
|
||||
|
||||
|
||||
_ascend_device_type = None
|
||||
@@ -696,13 +696,13 @@ def check_ascend_device_type():
|
||||
|
||||
soc_version = torch_npu.npu.get_soc_version()
|
||||
if 220 <= soc_version <= 225:
|
||||
cur_device_type = AscendDeviceType._910B
|
||||
cur_device_type = AscendDeviceType.A2
|
||||
elif 250 <= soc_version <= 255:
|
||||
cur_device_type = AscendDeviceType._910_93
|
||||
cur_device_type = AscendDeviceType.A3
|
||||
elif 200 <= soc_version <= 205:
|
||||
cur_device_type = AscendDeviceType._310P
|
||||
elif soc_version == 260:
|
||||
cur_device_type = AscendDeviceType._910_95
|
||||
cur_device_type = AscendDeviceType.A5
|
||||
else:
|
||||
raise RuntimeError(f"Can not support soc_version: {soc_version}.")
|
||||
|
||||
|
||||
@@ -90,7 +90,7 @@ class NPUWorker(WorkerBase):
|
||||
# Register ops when worker init.
|
||||
from vllm_ascend import ops
|
||||
ops.register_dummy_fusion_op()
|
||||
if get_ascend_device_type() != AscendDeviceType._910_95:
|
||||
if get_ascend_device_type() != AscendDeviceType.A5:
|
||||
_register_atb_extensions()
|
||||
register_ascend_customop(vllm_config)
|
||||
# init ascend config and soc version
|
||||
@@ -360,7 +360,7 @@ class NPUWorker(WorkerBase):
|
||||
self.model_runner.capture_model()
|
||||
# Call ATB matmul to warm up; otherwise, the first operation (ReshapeAndCache)
|
||||
# may cause performance degradation at runtime.
|
||||
if get_ascend_device_type() != AscendDeviceType._910_95:
|
||||
if get_ascend_device_type() != AscendDeviceType.A5:
|
||||
self._warm_up_atb()
|
||||
# Reset the seed to ensure that the random state is not affected by
|
||||
# the model initialization and profiling.
|
||||
|
||||
Reference in New Issue
Block a user