diff --git a/setup.py b/setup.py
index b0c54cdf..a38d9563 100644
--- a/setup.py
+++ b/setup.py
@@ -135,23 +135,22 @@ else:
 
 def gen_build_info():
     soc_version = envs.SOC_VERSION
-    # TODO(zzzzwwjj): Add A5 case
     soc_to_device = {
-        "910b": "_910B",
-        "910c": "_910_93",
+        "910b": "A2",
+        "910c": "A3",
         "310p": "_310P",
-        "ascend910b1": "_910B",
-        "ascend910b2": "_910B",
-        "ascend910b2c": "_910B",
-        "ascend910b3": "_910B",
-        "ascend910b4": "_910B",
-        "ascend910b4-1": "_910B",
-        "ascend910_9391": "_910_93",
-        "ascend910_9381": "_910_93",
-        "ascend910_9372": "_910_93",
-        "ascend910_9392": "_910_93",
-        "ascend910_9382": "_910_93",
-        "ascend910_9362": "_910_93",
+        "ascend910b1": "A2",
+        "ascend910b2": "A2",
+        "ascend910b2c": "A2",
+        "ascend910b3": "A2",
+        "ascend910b4": "A2",
+        "ascend910b4-1": "A2",
+        "ascend910_9391": "A3",
+        "ascend910_9381": "A3",
+        "ascend910_9372": "A3",
+        "ascend910_9392": "A3",
+        "ascend910_9382": "A3",
+        "ascend910_9362": "A3",
         "ascend310p1": "_310P",
         "ascend310p3": "_310P",
         "ascend310p5": "_310P",
@@ -160,7 +159,7 @@ def gen_build_info():
         "ascend310p3vir02": "_310P",
         "ascend310p3vir04": "_310P",
         "ascend310p3vir08": "_310P",
-        "ascend910_9579": "_910_95",
+        "ascend910_9579": "A5",
     }
 
     assert soc_version in soc_to_device, f"Undefined soc_version: {soc_version}. Please file an issue to vllm-ascend."
diff --git a/tests/e2e/multicard/test_aclgraph_capture_replay.py b/tests/e2e/multicard/test_aclgraph_capture_replay.py
index e81b5615..a5e3b846 100644
--- a/tests/e2e/multicard/test_aclgraph_capture_replay.py
+++ b/tests/e2e/multicard/test_aclgraph_capture_replay.py
@@ -215,7 +215,7 @@ def test_aclgraph_capture_replay_metrics_dp2(
     # Part A: Warmup runs (Profile run + 2 runs per captured graph)
     warmup_runs = 1 + (2 * max_batch_sizes)
     soc_version = get_ascend_device_type()
-    if soc_version in {AscendDeviceType._910_93} and "DeepSeek" in model:
+    if soc_version in {AscendDeviceType.A3} and "DeepSeek" in model:
         # An extra warmup run is needed for MC2 warmup here
         warmup_runs += 1
 
diff --git a/tests/ut/attention/test_attention_v1.py b/tests/ut/attention/test_attention_v1.py
index 5923a021..e2c83e41 100644
--- a/tests/ut/attention/test_attention_v1.py
+++ b/tests/ut/attention/test_attention_v1.py
@@ -26,7 +26,7 @@ class TestAscendAttentionBackend(TestBase):
                          AscendAttentionMetadataBuilder)
 
     @patch('vllm_ascend.utils.get_ascend_device_type',
-           return_value=AscendDeviceType._910_93)
+           return_value=AscendDeviceType.A3)
     def test_get_kv_cache_shape_not_310p(self, mock_soc_version):
         result = AscendAttentionBackend.get_kv_cache_shape(10, 20, 30, 40)
         self.assertEqual(result, (2, 10, 20, 30, 40))
@@ -103,7 +103,7 @@ class TestAscendAttentionMetadataBuilder(TestBase):
 
     @patch('vllm_ascend.attention.attention_v1.AscendMetadata')
     @patch('vllm_ascend.utils.get_ascend_device_type',
-           return_value=AscendDeviceType._910_93)
+           return_value=AscendDeviceType.A3)
     def test_build_non_310p(self, mock_soc_version, mock_ascend_metadata):
         common_attn_metadata = AscendCommonAttentionMetadata(
             query_start_loc=torch.tensor([0, 2, 5, 9]),
diff --git a/tests/ut/ops/test_activation.py b/tests/ut/ops/test_activation.py
index 12e8f4c2..9b802365 100644
--- a/tests/ut/ops/test_activation.py
+++ b/tests/ut/ops/test_activation.py
@@ -49,7 +49,7 @@ def test_SiluAndMul_forward(mock_maybe_prefetch_mlp_down_proj,
 
     with patch("vllm_ascend.utils.get_ascend_device_type",
                return_value=AscendDeviceType._310P
-               if is_310p else AscendDeviceType._910_93):
+               if is_310p else AscendDeviceType.A3):
         layer = SiluAndMul()
         out = layer.forward(dummy_tensor)
diff --git a/tests/ut/ops/test_fused_moe.py b/tests/ut/ops/test_fused_moe.py
index d1981b2a..94a52b7c 100644
--- a/tests/ut/ops/test_fused_moe.py
+++ b/tests/ut/ops/test_fused_moe.py
@@ -127,7 +127,7 @@ def mock_dist_env(mocker: MockerFixture):
                return_value=mock_forward_context_obj), \
         patch('vllm_ascend.ops.fused_moe.prepare_finalize.get_forward_context',
               return_value=mock_forward_context_obj), \
-        patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType._910_93), \
+        patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3), \
         patch('vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context',
               return_value=mock_forward_context_obj), \
         patch('vllm_ascend.ops.fused_moe.moe_comm_method.MC2CommImpl._get_token_dispatcher',
@@ -323,7 +323,7 @@ class TestUnifiedApplyMLP(TestBase):
 
     @patch('vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context')
     @patch('vllm_ascend.utils.get_ascend_device_type',
-           return_value=AscendDeviceType._910_93)
+           return_value=AscendDeviceType.A3)
     @patch('torch_npu.npu_grouped_matmul')
     @patch('torch_npu.npu_dynamic_quant')
     @patch('torch_npu.npu_dequant_swiglu_quant')
@@ -386,7 +386,7 @@ class TestUnifiedApplyMLP(TestBase):
         self.assertEqual(result.dtype, torch.bfloat16)
 
     @patch('vllm_ascend.utils.get_ascend_device_type',
-           return_value=AscendDeviceType._910_93)
+           return_value=AscendDeviceType.A3)
     @patch('torch_npu.npu_grouped_matmul')
     @patch('torch_npu.npu_swiglu')
     @patch('torch_npu.npu_dynamic_quant')
diff --git a/tests/ut/ops/test_layernorm.py b/tests/ut/ops/test_layernorm.py
index e50656e8..03befc7e 100644
--- a/tests/ut/ops/test_layernorm.py
+++ b/tests/ut/ops/test_layernorm.py
@@ -30,7 +30,7 @@ def test_RMSNorm_forward(mock_add_rmsnorm, mock_rmsnorm, is_310p, residual,
 
     with patch("vllm_ascend.utils.get_ascend_device_type",
                return_value=AscendDeviceType._310P
-               if is_310p else AscendDeviceType._910_93):
+               if is_310p else AscendDeviceType.A3):
         layer = RMSNorm(hidden_size=8, eps=1e-05)
         if residual is not None:
             out_x, out_residual = layer.forward_oot(dummy_tensor, residual)
diff --git a/tests/ut/ops/test_rotary_embedding.py b/tests/ut/ops/test_rotary_embedding.py
index f3e263ff..569b70ab 100644
--- a/tests/ut/ops/test_rotary_embedding.py
+++ b/tests/ut/ops/test_rotary_embedding.py
@@ -99,7 +99,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
 
     @patch('torch.ops._C_ascend')
     @patch('vllm_ascend.utils.get_ascend_device_type',
-           return_value=AscendDeviceType._910_93)
+           return_value=AscendDeviceType.A3)
     @patch('vllm_ascend.ops.rotary_embedding._custom_rotary_embedding_enabled',
            return_value=True)
     @patch('torch.ops._npu_rotary_embedding')
diff --git a/tests/ut/ops/test_token_dispatcher.py b/tests/ut/ops/test_token_dispatcher.py
index 26a741c3..140bae5c 100644
--- a/tests/ut/ops/test_token_dispatcher.py
+++ b/tests/ut/ops/test_token_dispatcher.py
@@ -53,7 +53,7 @@ class TestTokenDispatcherWithMC2(TestBase):
         # Mock get_ascend_device_type()
        self.ascend_soc_version_patch = patch(
             "vllm_ascend.ops.fused_moe.token_dispatcher.get_ascend_device_type",
-            return_value=AscendDeviceType._910_93)
+            return_value=AscendDeviceType.A3)
         self.ascend_soc_version_patch.start()
 
         kwargs = {"with_quant": False, "top_k": 8, "num_experts": 128}
diff --git a/tests/ut/quantization/test_w8a8.py b/tests/ut/quantization/test_w8a8.py
index c574d998..9a1c0e8d 100644
--- a/tests/ut/quantization/test_w8a8.py
+++ b/tests/ut/quantization/test_w8a8.py
@@ -347,7 +347,7 @@ class TestAscendC8KVCacheMethod(TestBase):
             self.assertEqual(param.shape, expected_shape)
 
     @patch('vllm_ascend.utils.get_ascend_device_type',
-           return_value=AscendDeviceType._910_93)
+           return_value=AscendDeviceType.A3)
     def test_process_weights_after_loading_not_310p(self, mock_soc_version):
         key_data = torch.ones(4 * 64)
         value_data = torch.ones(4 * 64) * 2
diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py
index 6a2ce5c8..a5257ce9 100644
--- a/tests/ut/test_platform.py
+++ b/tests/ut/test_platform.py
@@ -231,7 +231,7 @@ class TestNPUPlatform(TestBase):
     @patch("vllm_ascend.ascend_config.init_ascend_config")
     @patch("vllm_ascend.utils.update_aclgraph_sizes")
     @patch('vllm_ascend.utils.get_ascend_device_type',
-           return_value=AscendDeviceType._910_93)
+           return_value=AscendDeviceType.A3)
     @patch("os.environ", {})
     @patch(
         "vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
@@ -263,7 +263,7 @@ class TestNPUPlatform(TestBase):
         mock_init_ascend.assert_called_once_with(vllm_config)
 
     @patch('vllm_ascend.utils.get_ascend_device_type',
-           return_value=AscendDeviceType._910_93)
+           return_value=AscendDeviceType.A3)
     @patch("vllm_ascend.ascend_config.init_ascend_config")
     @patch(
         "vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
@@ -288,7 +288,7 @@ class TestNPUPlatform(TestBase):
         self.assertTrue("Model config is missing" in cm.output[0])
 
     @patch('vllm_ascend.utils.get_ascend_device_type',
-           return_value=AscendDeviceType._910_93)
+           return_value=AscendDeviceType.A3)
     @patch("vllm_ascend.ascend_config.init_ascend_config")
     @patch(
         "vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
@@ -324,7 +324,7 @@ class TestNPUPlatform(TestBase):
     )
 
     @patch('vllm_ascend.utils.get_ascend_device_type',
-           return_value=AscendDeviceType._910_93)
+           return_value=AscendDeviceType.A3)
     @patch("vllm_ascend.utils.update_default_aclgraph_sizes")
     @patch("vllm_ascend.ascend_config.init_ascend_config")
     @patch(
@@ -365,7 +365,7 @@ class TestNPUPlatform(TestBase):
     @pytest.mark.skip(
         "Revert me when vllm support setting cudagraph_mode on oot platform")
     @patch('vllm_ascend.utils.get_ascend_device_type',
-           return_value=AscendDeviceType._910_93)
+           return_value=AscendDeviceType.A3)
     @patch("vllm_ascend.ascend_config.init_ascend_config")
     def test_check_and_update_config_unsupported_cudagraph_mode(
             self, mock_init_ascend, mock_soc_version):
@@ -394,7 +394,7 @@ class TestNPUPlatform(TestBase):
     )
 
     @patch('vllm_ascend.utils.get_ascend_device_type',
-           return_value=AscendDeviceType._910_93)
+           return_value=AscendDeviceType.A3)
     @patch("vllm_ascend.ascend_config.init_ascend_config")
     @patch(
         "vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
@@ -421,7 +421,7 @@ class TestNPUPlatform(TestBase):
         self.assertEqual(vllm_config.cache_config.block_size, 128)
 
     @patch('vllm_ascend.utils.get_ascend_device_type',
-           return_value=AscendDeviceType._910_93)
+           return_value=AscendDeviceType.A3)
     @patch("vllm_ascend.ascend_config.init_ascend_config")
     @patch(
         "vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py
index 3ebf11b4..ebc22bd1 100644
--- a/vllm_ascend/ascend_forward_context.py
+++ b/vllm_ascend/ascend_forward_context.py
@@ -244,7 +244,7 @@ def select_moe_comm_method(num_tokens: int,
 
     if not vllm_config.parallel_config.enable_expert_parallel:
         moe_comm_type = MoECommType.ALLGATHER
-    elif soc_version in {AscendDeviceType._910B}:
+    elif soc_version in {AscendDeviceType.A2}:
         if (num_tokens <= mc2_tokens_capacity and
                 vllm_config.parallel_config.world_size_across_dp /
                 vllm_config.parallel_config.pipeline_parallel_size >= 16):
@@ -256,7 +256,7 @@ def select_moe_comm_method(num_tokens: int,
         else:
             moe_comm_type = MoECommType.ALLGATHER
-    elif soc_version in {AscendDeviceType._910_93}:
+    elif soc_version in {AscendDeviceType.A3}:
         ascend_config = get_ascend_config()
         dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
         # TODO: drop the EP-size guard when dispatch_ffn_combine supports larger EP sizes
diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
index 7faa30a9..875dd432 100644
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -640,7 +640,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
         attn_metadata: AscendMetadata,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        if get_ascend_device_type() == AscendDeviceType._910_95:
+        if get_ascend_device_type() == AscendDeviceType.A5:
             return self._forward_decode_only_ascend91095(
                 query, attn_metadata, output)
         if self.sliding_window is not None and attn_metadata.seq_lens.shape[
@@ -729,7 +729,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
         if self.key_cache is None:
             self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
         slots = attn_metadata.slot_mapping
-        if get_ascend_device_type() == AscendDeviceType._910_95:
+        if get_ascend_device_type() == AscendDeviceType.A5:
             # TODO: Once eagle running to here, it may has error because of the 0 dim of slot_mapping.
             # Should check if the 0 dim of slot_mapping must equal to the 0 dim of key.
             # If it's necessary, the slots should be sliced.
diff --git a/vllm_ascend/ops/fused_moe/token_dispatcher.py b/vllm_ascend/ops/fused_moe/token_dispatcher.py
index e45504d9..1246d648 100644
--- a/vllm_ascend/ops/fused_moe/token_dispatcher.py
+++ b/vllm_ascend/ops/fused_moe/token_dispatcher.py
@@ -99,7 +99,7 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
         self.enable_dispatch_v2 = hasattr(torch_npu,
                                           "npu_moe_distribute_dispatch_v2")
         self.need_extra_args = (
-            get_ascend_device_type() == AscendDeviceType._910_93)
+            get_ascend_device_type() == AscendDeviceType.A3)
 
         # NOTE: When in A2, setting the environment variables HCCL_INTRA_PCIE_ENABLE=1 and
         # HCCL_INTRA_ROCE_ENABLE=0 can reduce cross-machine communication traffic and significantly
diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py
index 7f860470..566c48f4 100644
--- a/vllm_ascend/ops/rotary_embedding.py
+++ b/vllm_ascend/ops/rotary_embedding.py
@@ -499,7 +499,7 @@ class AscendMRotaryEmbedding(MRotaryEmbedding):
         key: torch.Tensor,
     ):
         if self.mrope_section != [16, 24, 24] or \
-            get_ascend_device_type() == AscendDeviceType._910_95:
+            get_ascend_device_type() == AscendDeviceType.A5:
             return super().forward_oot(positions, query, key)
 
         import torch_npu
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index dc02e1bc..9e2431bb 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -674,10 +674,10 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None):
 
 
 class AscendDeviceType(Enum):
-    _910B = 0  # A2
-    _910_93 = 1  # A3
+    A2 = 0
+    A3 = 1
     _310P = 2
-    _910_95 = 3  # A5
+    A5 = 3
 
 
 _ascend_device_type = None
@@ -696,13 +696,13 @@ def check_ascend_device_type():
     soc_version = torch_npu.npu.get_soc_version()
 
     if 220 <= soc_version <= 225:
-        cur_device_type = AscendDeviceType._910B
+        cur_device_type = AscendDeviceType.A2
     elif 250 <= soc_version <= 255:
-        cur_device_type = AscendDeviceType._910_93
+        cur_device_type = AscendDeviceType.A3
     elif 200 <= soc_version <= 205:
         cur_device_type = AscendDeviceType._310P
     elif soc_version == 260:
-        cur_device_type = AscendDeviceType._910_95
+        cur_device_type = AscendDeviceType.A5
     else:
         raise RuntimeError(f"Can not support soc_version: {soc_version}.")
 
diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index 29e2fb85..3e1f3f59 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -90,7 +90,7 @@ class NPUWorker(WorkerBase):
         # Register ops when worker init.
         from vllm_ascend import ops
         ops.register_dummy_fusion_op()
-        if get_ascend_device_type() != AscendDeviceType._910_95:
+        if get_ascend_device_type() != AscendDeviceType.A5:
             _register_atb_extensions()
         register_ascend_customop(vllm_config)
         # init ascend config and soc version
@@ -360,7 +360,7 @@ class NPUWorker(WorkerBase):
             self.model_runner.capture_model()
         # Call ATB matmul to warm up; otherwise, the first operation (ReshapeAndCache)
         # may cause performance degradation at runtime.
-        if get_ascend_device_type() != AscendDeviceType._910_95:
+        if get_ascend_device_type() != AscendDeviceType.A5:
             self._warm_up_atb()
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
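
For reference, and not part of the patch itself: a minimal sketch of how a call site reads after this rename, assuming only what the vllm_ascend/utils.py hunk above defines (the AscendDeviceType enum and get_ascend_device_type()).

# Illustration only: the renamed members A2/A3/A5 replace _910B/_910_93/_910_95,
# while the 310P member keeps its original name.
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type

device_type = get_ascend_device_type()
if device_type == AscendDeviceType.A5:
    pass  # A5-specific path (was AscendDeviceType._910_95)
elif device_type in {AscendDeviceType.A2, AscendDeviceType.A3}:
    pass  # A2/A3 paths (were AscendDeviceType._910B / AscendDeviceType._910_93)
else:
    pass  # 310P path, still AscendDeviceType._310P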