[main] rename device type (#5099)
### What this PR does / why we need it?
- Rename `_910B` to `A2`
- Rename `_910_93` to `A3`
- Rename `_910_95` to `A5`

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
Signed-off-by: zzzzwwjj <1183291235@qq.com>
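Call sites change one token each. A minimal sketch of the new spelling, with the old names noted in comments (the enum is redefined here so the snippet runs standalone; values match the diff below):

```python
from enum import Enum

class AscendDeviceType(Enum):
    A2 = 0     # formerly _910B
    A3 = 1     # formerly _910_93
    _310P = 2  # unchanged
    A5 = 3     # formerly _910_95

# Before this PR a check read `device == AscendDeviceType._910B`; now:
device = AscendDeviceType.A2
print(device.name, device.value)  # -> A2 0
```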
```diff
@@ -244,7 +244,7 @@ def select_moe_comm_method(num_tokens: int,
     if not vllm_config.parallel_config.enable_expert_parallel:
         moe_comm_type = MoECommType.ALLGATHER
-    elif soc_version in {AscendDeviceType._910B}:
+    elif soc_version in {AscendDeviceType.A2}:
         if (num_tokens <= mc2_tokens_capacity
                 and vllm_config.parallel_config.world_size_across_dp /
                 vllm_config.parallel_config.pipeline_parallel_size >= 16):
```
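The A2 branch above gates MC2 on two conditions: the batch fits the MC2 token capacity, and the data-parallel group is at least 16 ranks wide. A worked check with illustrative numbers (the real values come from `vllm_config` at runtime):

```python
# Illustrative values, not defaults from vllm-ascend.
num_tokens = 256
mc2_tokens_capacity = 512
world_size_across_dp = 32
pipeline_parallel_size = 2

eligible = (num_tokens <= mc2_tokens_capacity
            and world_size_across_dp / pipeline_parallel_size >= 16)
print(eligible)  # True: 32 / 2 = 16 ranks, and 256 <= 512 tokens
```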
```diff
@@ -256,7 +256,7 @@ def select_moe_comm_method(num_tokens: int,
         else:
             moe_comm_type = MoECommType.ALLGATHER
 
-    elif soc_version in {AscendDeviceType._910_93}:
+    elif soc_version in {AscendDeviceType.A3}:
         ascend_config = get_ascend_config()
         dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
         # TODO: drop the EP-size guard when dispatch_ffn_combine supports larger EP sizes
```
```diff
@@ -640,7 +640,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
         attn_metadata: AscendMetadata,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        if get_ascend_device_type() == AscendDeviceType._910_95:
+        if get_ascend_device_type() == AscendDeviceType.A5:
             return self._forward_decode_only_ascend91095(
                 query, attn_metadata, output)
         if self.sliding_window is not None and attn_metadata.seq_lens.shape[
```
```diff
@@ -729,7 +729,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
         if self.key_cache is None:
             self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
         slots = attn_metadata.slot_mapping
-        if get_ascend_device_type() == AscendDeviceType._910_95:
+        if get_ascend_device_type() == AscendDeviceType.A5:
             # TODO: Once Eagle runs through here, it may fail because of the 0th dim of slot_mapping.
             # Check whether the 0th dim of slot_mapping must equal the 0th dim of key.
             # If so, the slots should be sliced.
```
```diff
@@ -99,7 +99,7 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
         self.enable_dispatch_v2 = hasattr(torch_npu,
                                           "npu_moe_distribute_dispatch_v2")
         self.need_extra_args = (
-            get_ascend_device_type() == AscendDeviceType._910_93)
+            get_ascend_device_type() == AscendDeviceType.A3)
 
         # NOTE: When in A2, setting the environment variables HCCL_INTRA_PCIE_ENABLE=1 and
         # HCCL_INTRA_ROCE_ENABLE=0 can reduce cross-machine communication traffic and significantly
```
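The NOTE above describes plain environment configuration; a hedged example of applying it for an A2 deployment (setting these via `os.environ` before engine startup is an assumption about placement, not code from this PR):

```python
import os

# Per the NOTE: on A2, these HCCL settings can reduce cross-machine
# communication traffic. They must be set before HCCL initializes.
os.environ.setdefault("HCCL_INTRA_PCIE_ENABLE", "1")
os.environ.setdefault("HCCL_INTRA_ROCE_ENABLE", "0")
```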
```diff
@@ -499,7 +499,7 @@ class AscendMRotaryEmbedding(MRotaryEmbedding):
         key: torch.Tensor,
     ):
         if self.mrope_section != [16, 24, 24] or \
-            get_ascend_device_type() == AscendDeviceType._910_95:
+            get_ascend_device_type() == AscendDeviceType.A5:
             return super().forward_oot(positions, query, key)
 
         import torch_npu
```
```diff
@@ -674,10 +674,10 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None):
 
 
 class AscendDeviceType(Enum):
-    _910B = 0  # A2
-    _910_93 = 1  # A3
+    A2 = 0
+    A3 = 1
     _310P = 2
-    _910_95 = 3  # A5
+    A5 = 3
 
 
 _ascend_device_type = None
```
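`_310P` keeps its leading underscore because `310P` is not a valid Python identifier. External code still using the old member names will now raise `AttributeError`; if a migration window were wanted, a hypothetical shim (not part of this PR) could accept both spellings:

```python
from enum import Enum

class AscendDeviceType(Enum):  # as defined after this PR
    A2 = 0
    A3 = 1
    _310P = 2
    A5 = 3

# Hypothetical helper, not in this PR: resolve legacy names to new members.
_LEGACY_NAMES = {"_910B": "A2", "_910_93": "A3", "_910_95": "A5"}

def resolve_device_type(name: str) -> AscendDeviceType:
    return AscendDeviceType[_LEGACY_NAMES.get(name, name)]

assert resolve_device_type("_910B") is AscendDeviceType.A2
assert resolve_device_type("A5") is AscendDeviceType.A5
```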
```diff
@@ -696,13 +696,13 @@ def check_ascend_device_type():
 
     soc_version = torch_npu.npu.get_soc_version()
     if 220 <= soc_version <= 225:
-        cur_device_type = AscendDeviceType._910B
+        cur_device_type = AscendDeviceType.A2
     elif 250 <= soc_version <= 255:
-        cur_device_type = AscendDeviceType._910_93
+        cur_device_type = AscendDeviceType.A3
     elif 200 <= soc_version <= 205:
         cur_device_type = AscendDeviceType._310P
     elif soc_version == 260:
-        cur_device_type = AscendDeviceType._910_95
+        cur_device_type = AscendDeviceType.A5
     else:
         raise RuntimeError(f"Can not support soc_version: {soc_version}.")
```
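The `soc_version` ranges above fully determine the device type. A standalone restatement that can be sanity-checked without `torch_npu` (member names returned as strings for brevity):

```python
def device_type_for(soc_version: int) -> str:
    # Ranges copied from check_ascend_device_type() in the diff above.
    if 220 <= soc_version <= 225:
        return "A2"
    if 250 <= soc_version <= 255:
        return "A3"
    if 200 <= soc_version <= 205:
        return "_310P"
    if soc_version == 260:
        return "A5"
    raise RuntimeError(f"Can not support soc_version: {soc_version}.")

assert device_type_for(220) == "A2"
assert device_type_for(255) == "A3"
assert device_type_for(260) == "A5"
```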
```diff
@@ -90,7 +90,7 @@ class NPUWorker(WorkerBase):
         # Register ops when worker init.
         from vllm_ascend import ops
         ops.register_dummy_fusion_op()
-        if get_ascend_device_type() != AscendDeviceType._910_95:
+        if get_ascend_device_type() != AscendDeviceType.A5:
             _register_atb_extensions()
         register_ascend_customop(vllm_config)
         # init ascend config and soc version
```
```diff
@@ -360,7 +360,7 @@ class NPUWorker(WorkerBase):
         self.model_runner.capture_model()
         # Call ATB matmul to warm up; otherwise, the first operation (ReshapeAndCache)
         # may cause performance degradation at runtime.
-        if get_ascend_device_type() != AscendDeviceType._910_95:
+        if get_ascend_device_type() != AscendDeviceType.A5:
             self._warm_up_atb()
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
```
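The guarded `_warm_up_atb()` call is the usual pay-the-first-launch-cost-early pattern. A generic sketch of that idea (this is not the body of `_warm_up_atb`, which the diff does not show; it assumes `torch_npu` is installed):

```python
import torch
import torch_npu  # registers the "npu" device with torch

def warm_up_matmul() -> None:
    # Run one small matmul so kernel libraries are loaded before serving;
    # otherwise the first real operator absorbs the one-time cost.
    a = torch.ones(16, 16, device="npu")
    b = torch.ones(16, 16, device="npu")
    _ = a @ b
    torch.npu.synchronize()
```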