From 136ea9ff560b610f0db0065dc520062acbbe4283 Mon Sep 17 00:00:00 2001
From: zzzzwwjj <34335947+zzzzwwjj@users.noreply.github.com>
Date: Wed, 26 Nov 2025 14:28:55 +0800
Subject: [PATCH] [refact] unified soc_version code (#4359)

### What this PR does / why we need it?
Currently there are two code paths for determining the chip type: `get_ascend_soc_version` uses the `get_soc_version` API from torch_npu, while `is_310p` uses `_build_info.__soc_version__`, which is generated at install time. These two paths need to be unified, for the following reasons:
1. Chip-type detection must be consistent between build time and runtime;
2. At build time we need the full SoC version to compile ops, but at runtime we only need the device type (910B/910_93/310P/910_95/etc.) for code-branch decisions;
3. At build time, torch_npu may not be installed yet, so we cannot use its API.

Based on the above points, this PR makes the following changes:
1. When the user sets the env `SOC_VERSION`, use it; when it is not set, query the SoC version via `npu-smi`;
2. Generate the device type from the SoC version at build time, and write `__device_type__` instead of `__soc_version__` into `_build_info.py`;
3. At runtime, use `__device_type__` to select code branches.

### Does this PR introduce _any_ user-facing change?
When the env `SOC_VERSION` is not set, it no longer defaults to `ASCEND910B1`; the SoC version is queried via `npu-smi` instead. In addition, `SOC_VERSION` must be one of the keys of the `soc_to_device` mapping in `setup.py`.

- vLLM version: v0.11.0
- vLLM main: https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379

Signed-off-by: zzzzwwjj <1183291235@qq.com>
--- .../disaggregated_prefill_v1/gen_ranktable.py | 9 +- setup.py | 88 +++++++++++++++++-- tests/ut/attention/test_attention_v1.py | 40 +++++---- tests/ut/models/conftest.py | 2 +- tests/ut/ops/test_activation.py | 12 ++- tests/ut/ops/test_fused_moe.py | 27 +++--- tests/ut/ops/test_layernorm.py | 6 +- tests/ut/ops/test_rotary_embedding.py | 8 +- tests/ut/ops/test_token_dispatcher.py | 8 +- tests/ut/quantization/test_w8a8.py | 22 +++-- tests/ut/test_platform.py | 52 ++++++----- tests/ut/test_utils.py | 10 --- .../torchair/ops/test_torchair_fused_moe.py | 4 +- .../ops/test_torchair_rotary_embedding.py | 8 +- .../test_torchair_w8a8_dynamic.py | 6 +- tests/ut/worker/test_model_runner_v1.py | 24 ++--- tests/ut/worker/test_worker_v1.py | 14 +-- vllm_ascend/attention/attention_v1.py | 16 ++-- .../llmdatadist_c_mgr_connector.py | 6 +- vllm_ascend/envs.py | 6 +- vllm_ascend/lora/punica_npu.py | 4 +- vllm_ascend/ops/activation.py | 4 +- vllm_ascend/ops/fused_moe/fused_moe.py | 9 +- vllm_ascend/ops/fused_moe/moe_mlp.py | 5 +- vllm_ascend/ops/fused_moe/token_dispatcher.py | 6 +- vllm_ascend/ops/layernorm.py | 11 +-- vllm_ascend/ops/rotary_embedding.py | 8 +- .../patch/platform/patch_distributed.py | 4 +- vllm_ascend/platform.py | 7 +- vllm_ascend/quantization/w8a8.py | 17 ++-- vllm_ascend/sample/sampler.py | 5 +- .../torchair/models/torchair_pangu_moe.py | 9 +- .../torchair/ops/torchair_activation.py | 4 +- .../torchair/ops/torchair_fused_moe.py | 10 +-- .../torchair/ops/torchair_layernorm.py | 4 +- .../torchair/ops/torchair_rotary_embedding.py | 8 +- .../quantization/torchair_w8a8_dynamic.py | 8 +- vllm_ascend/torchair/torchair_attention.py | 9 +- vllm_ascend/torchair/torchair_model_runner.py | 28 +++--- vllm_ascend/utils.py | 56 ++++++------ vllm_ascend/worker/model_runner_v1.py | 16 ++-- vllm_ascend/worker/worker_v1.py | 4 +- 42 files changed, 361 insertions(+), 243 deletions(-)
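For illustration, the build-time flow introduced by this PR can be summarized by the minimal sketch below. It mirrors the `get_chip_info` and `gen_build_info` code in the diff that follows, but with a trimmed mapping and a hypothetical `resolve_soc_version` helper; this is a simplified illustration, not the shipped implementation.

```python
import os
import subprocess

# Trimmed copy of the soc_to_device mapping defined in setup.py below.
SOC_TO_DEVICE = {
    "ascend910b1": "_910B",
    "ascend910_9391": "_910_93",
    "ascend310p3": "_310P",
}


def resolve_soc_version() -> str:
    """Prefer env SOC_VERSION; otherwise query the chip via npu-smi."""
    soc_version = os.getenv("SOC_VERSION")
    if soc_version:
        return soc_version
    try:
        # setup.py's get_chip_info() parses `Chip Name`/`Chip Type`/`NPU Name`
        # from this output; the parsing here is deliberately simplified.
        lines = subprocess.check_output(
            ["npu-smi", "info", "-t", "board", "-i", "0", "-c", "0"]
        ).decode().splitlines()
        fields = {key.strip(): value.strip()
                  for key, _, value in (line.partition(":") for line in lines)
                  if key.strip()}
        return (fields.get("Chip Type", "") + fields.get("Chip Name", "")).lower()
    except FileNotFoundError:
        # No npu-smi (e.g. a CPU-only build machine): same fallback as setup.py.
        return "ascend910b1"


def gen_build_info(path: str) -> None:
    soc_version = resolve_soc_version()
    device_type = SOC_TO_DEVICE[soc_version]  # KeyError => unsupported SoC
    with open(path, "w") as f:
        f.write("# Auto-generated file\n")
        f.write(f"__device_type__ = '{device_type}'\n")
```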
diff --git a/examples/disaggregated_prefill_v1/gen_ranktable.py b/examples/disaggregated_prefill_v1/gen_ranktable.py index 3ed8b768..a687fa68 100644 --- a/examples/disaggregated_prefill_v1/gen_ranktable.py +++ b/examples/disaggregated_prefill_v1/gen_ranktable.py @@ -4,7 +4,7 @@ import os import torch.distributed as dist -from vllm_ascend.utils import AscendSocVersion, init_ascend_soc_version, get_ascend_soc_version +from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type parser = argparse.ArgumentParser( description="Arguments of rank table generator", ) @@ -42,8 +42,7 @@ local_rank = os.environ.get("LOCAL_RANK") # and is different from WORLD_SIZE in gen_rank_table.sh. world_size = os.environ.get("WORLD_SIZE") -init_ascend_soc_version() -soc_info = get_ascend_soc_version() +device_type = get_ascend_device_type() def get_cmd_stdout(cmd): @@ -83,7 +82,7 @@ if local_rank == "0": device_id = local_device_ids[idx] chip_id = device_id % chips_per_card card_id = device_id // chips_per_card - if soc_info == AscendSocVersion.A3: + if device_type == AscendDeviceType._910_93: device_ip = get_cmd_stdout( f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr" ).split(":")[1].strip() @@ -103,7 +102,7 @@ if local_rank == "0": "device_id": str(device_id), "device_ip": str(device_ip), } - if soc_info == AscendSocVersion.A3: + if device_type == AscendDeviceType._910_93: device_info.update({ "super_pod_id": str(super_pod_id), "super_device_id": str(super_device_id) diff --git a/setup.py b/setup.py index 1c4ced47..3ab900dc 100644 --- a/setup.py +++ b/setup.py @@ -65,25 +65,103 @@ def check_or_set_default_env(cmake_args, return cmake_args +def get_value_from_lines(lines: List[str], key: str) -> str: + for line in lines: + line = ' '.join(line.split()) + if key in line: + return line.split(':')[-1].strip() + return "" + + +def get_chip_info() -> str: + try: + npu_info_lines = subprocess.check_output( + ['npu-smi', 'info', '-l']).decode().strip().split('\n') + npu_id = int(get_value_from_lines(npu_info_lines, 'NPU ID')) + chip_info_lines = subprocess.check_output( + ['npu-smi', 'info', '-t', 'board', '-i', + str(npu_id), '-c', '0']).decode().strip().split('\n') + chip_name = get_value_from_lines(chip_info_lines, 'Chip Name') + chip_type = get_value_from_lines(chip_info_lines, 'Chip Type') + npu_name = get_value_from_lines(chip_info_lines, 'NPU Name') + + if "310" in chip_name: + # 310P case + assert chip_type + return (chip_type + chip_name).lower() + elif "910" in chip_name: + if chip_type: + # A2 case + assert not npu_name + return (chip_type + chip_name).lower() + else: + # A3 case + assert npu_name + return (chip_name + '_' + npu_name).lower() + else: + # TODO(zzzzwwjj): Currently, A5's chip name has not been determined yet.
+ raise ValueError( + f"Unable to recognize chip name: {chip_name}, please manually set env SOC_VERSION" + ) + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Get chip info failed: {e}") + except FileNotFoundError: + # CPU environment (release code case): return `ascend910b1` by default + return "ascend910b1" + + envs = load_module_from_path("envs", os.path.join(ROOT_DIR, "vllm_ascend", "envs.py")) +soc_version = get_chip_info() + +if not envs.SOC_VERSION: + envs.SOC_VERSION = soc_version +else: + if envs.SOC_VERSION != soc_version: + logging.warning( + f"env SOC_VERSION: {envs.SOC_VERSION} is not equal to soc_version from npu-smi: {soc_version}" + ) + def gen_build_info(): soc_version = envs.SOC_VERSION - if not soc_version: - raise ValueError( - "SOC version is not set. Please set SOC_VERSION environment variable." - ) if "310" in soc_version and not envs.COMPILE_CUSTOM_KERNELS: raise ValueError( "SOC version 310 only supports custom kernels. Please set COMPILE_CUSTOM_KERNELS=1 to enable custom kernels." ) + # TODO(zzzzwwjj): Add A5 case + soc_to_device = { + "ascend910b1": "_910B", + "ascend910b2": "_910B", + "ascend910b2c": "_910B", + "ascend910b3": "_910B", + "ascend910b4": "_910B", + "ascend910b4-1": "_910B", + "ascend910_9391": "_910_93", + "ascend910_9381": "_910_93", + "ascend910_9372": "_910_93", + "ascend910_9392": "_910_93", + "ascend910_9382": "_910_93", + "ascend910_9362": "_910_93", + "ascend310p1": "_310P", + "ascend310p3": "_310P", + "ascend310p5": "_310P", + "ascend310p7": "_310P", + "ascend310p3vir01": "_310P", + "ascend310p3vir02": "_310P", + "ascend310p3vir04": "_310P", + "ascend310p3vir08": "_310P", + } + + assert soc_version in soc_to_device, f"Undefined soc_version: {soc_version}. Please file an issue to vllm-ascend."
+ device_type = soc_to_device[soc_version] + package_dir = os.path.join(ROOT_DIR, "vllm_ascend", "_build_info.py") with open(package_dir, "w+") as f: f.write('# Auto-generated file\n') - f.write(f"__soc_version__ = '{soc_version}'\n") + f.write(f"__device_type__ = '{device_type}'\n") f.write(f"__sleep_mode_enabled__ = {envs.COMPILE_CUSTOM_KERNELS}\n") logging.info(f"Generated _build_info.py with SOC version: {soc_version}") diff --git a/tests/ut/attention/test_attention_v1.py b/tests/ut/attention/test_attention_v1.py index 33a0db56..129b5410 100644 --- a/tests/ut/attention/test_attention_v1.py +++ b/tests/ut/attention/test_attention_v1.py @@ -9,6 +9,7 @@ from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend, AscendAttentionMetadataBuilder, AscendAttentionState) from vllm_ascend.attention.utils import AscendCommonAttentionMetadata +from vllm_ascend.utils import AscendDeviceType class TestAscendAttentionBackend(TestBase): @@ -24,14 +25,15 @@ class TestAscendAttentionBackend(TestBase): self.assertEqual(AscendAttentionBackend.get_builder_cls(), AscendAttentionMetadataBuilder) - @patch('vllm_ascend.attention.attention_v1.is_310p') - def test_get_kv_cache_shape_310p(self, mock_is_310p): - mock_is_310p.return_value = True + @patch('vllm_ascend.attention.attention_v1.get_ascend_device_type', + return_value=AscendDeviceType._310P) + def test_get_kv_cache_shape_310p(self, mock_soc_version): result = AscendAttentionBackend.get_kv_cache_shape(10, 20, 30, 40) self.assertEqual(result, (2, 10, 30 * 40 // 16, 20, 16)) - @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False) - def test_get_kv_cache_shape_not_310p(self, mock_is_310p): + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._910_93) + def test_get_kv_cache_shape_not_310p(self, mock_soc_version): result = AscendAttentionBackend.get_kv_cache_shape(10, 20, 30, 40) self.assertEqual(result, (2, 10, 20, 30, 40)) @@ -96,8 +98,9 @@ class TestAscendAttentionMetadataBuilder(TestBase): @patch('vllm_ascend.attention.attention_v1.AscendMetadata') @patch('torch_npu.npu_format_cast') @patch('vllm_ascend.utils.nd_to_nz_2d') - @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=True) - def test_build_prefill_no_cache(self, mock_is_310p, mock_nd_to_nz_2d, + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._310P) + def test_build_prefill_no_cache(self, mock_soc_version, mock_nd_to_nz_2d, mock_npu_format_cast, mock_ascend_metadata): common_attn_metadata = AscendCommonAttentionMetadata( @@ -128,10 +131,11 @@ class TestAscendAttentionMetadataBuilder(TestBase): @patch('vllm_ascend.attention.attention_v1.AscendMetadata') @patch('torch_npu.npu_format_cast') @patch('vllm_ascend.utils.nd_to_nz_spec') - @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=True) + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._310P) @patch('vllm_ascend.attention.attention_v1.AscendAttentionState') def test_build_chunked_prefill(self, mock_ascend_attention_state, - mock_is_310p, mock_nd_to_nz_spec, + mock_soc_version, mock_nd_to_nz_spec, mock_npu_format_cast, mock_ascend_metadata): common_attn_metadata = AscendCommonAttentionMetadata( query_start_loc=torch.tensor([0, 2, 5, 9]), @@ -162,8 +166,9 @@ class TestAscendAttentionMetadataBuilder(TestBase): self.builder.build(1, common_attn_metadata, mock_model) @patch('vllm_ascend.attention.attention_v1.AscendMetadata') - @patch('vllm_ascend.attention.attention_v1.is_310p', 
return_value=False) - def test_build_non_310p(self, mock_is_310p, mock_ascend_metadata): + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._910_93) + def test_build_non_310p(self, mock_soc_version, mock_ascend_metadata): common_attn_metadata = AscendCommonAttentionMetadata( query_start_loc=torch.tensor([0, 2, 5, 9]), query_start_loc_cpu=torch.tensor([0, 2, 5, 9]), @@ -450,12 +455,13 @@ class TestAscendAttentionBackendImpl(TestBase): assert output.shape == (10, 8 * 64) @patch('vllm_ascend.attention.attention_v1.get_forward_context') - @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False) + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._910_93) @patch('torch_npu._npu_reshape_and_cache') @patch('vllm_ascend.attention.attention_v1.vanilla_chunked_prefill') def test_forward_head_size_192(self, mock_vanilla_prefill, - mock_npu_reshape_and_cache, mock_is_310p, - mock_get_forward_context): + mock_npu_reshape_and_cache, + mock_soc_version, mock_get_forward_context): """Test forward pass when head_size is 192""" self.impl.head_size = 192 @@ -522,9 +528,11 @@ class TestAscendAttentionBackendImpl(TestBase): @patch('torch_npu.npu_format_cast') @patch('torch_npu._npu_reshape_and_cache') @patch('torch_npu.npu_fused_infer_attention_score') - @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=True) + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._310P) @patch('vllm_ascend.attention.attention_v1.get_forward_context') - def test_forward_310p_device(self, mock_get_forward_context, mock_is_310p, + def test_forward_310p_device(self, mock_get_forward_context, + mock_soc_version, mock_npu_fused_infer_attention_score, mock_npu_reshape_and_cache, mock_npu_format_cast): diff --git a/tests/ut/models/conftest.py b/tests/ut/models/conftest.py index 4f17e2df..faad7ff6 100644 --- a/tests/ut/models/conftest.py +++ b/tests/ut/models/conftest.py @@ -92,7 +92,7 @@ def mock_distributed(): with patch("vllm_ascend.ops.fused_moe.fused_moe.get_current_vllm_config", return_value=mock_vllm_config), \ patch("vllm_ascend.ops.fused_moe.token_dispatcher.torch.distributed.get_rank", return_value=0), \ - patch("vllm_ascend.ops.fused_moe.token_dispatcher.get_ascend_soc_version", return_value=None), \ + patch("vllm_ascend.ops.fused_moe.token_dispatcher.get_ascend_device_type", return_value=None), \ patch.dict("vllm.distributed.parallel_state.__dict__", _TP=tp_group, _EP=ep_group, _DP=dp_group, _PP=pp_group), \ patch.dict("vllm_ascend.distributed.parallel_state.__dict__", _MC2=ep_group), \ diff --git a/tests/ut/ops/test_activation.py b/tests/ut/ops/test_activation.py index 76bc55dc..12e8f4c2 100644 --- a/tests/ut/ops/test_activation.py +++ b/tests/ut/ops/test_activation.py @@ -19,6 +19,8 @@ import pytest import torch from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul +from vllm_ascend.utils import AscendDeviceType + @pytest.fixture def dummy_tensor(): @@ -36,20 +38,22 @@ def test_QuickGELU_forward(mock_gelu, dummy_tensor): mock_gelu.assert_called_once() -@pytest.mark.parametrize("is_310p_return", [True, False]) +@pytest.mark.parametrize("is_310p", [True, False]) @patch("torch_npu.npu_swiglu", side_effect=lambda x: x + 1) @patch("torch.ops.vllm.maybe_wait_prefetch_done", side_effect=lambda x: None) @patch("torch.ops.vllm.maybe_prefetch_mlp_down_proj", side_effect=lambda x: None) def test_SiluAndMul_forward(mock_maybe_prefetch_mlp_down_proj, mock_maybe_wait_prefetch_done, 
mock_swiglu, - is_310p_return, dummy_tensor): + is_310p, dummy_tensor): - with patch("vllm_ascend.utils.is_310p", return_value=is_310p_return): + with patch("vllm_ascend.utils.get_ascend_device_type", + return_value=AscendDeviceType._310P + if is_310p else AscendDeviceType._910_93): layer = SiluAndMul() out = layer.forward(dummy_tensor) - if is_310p_return: + if is_310p: expected_arg = dummy_tensor.to(torch.float32) else: expected_arg = dummy_tensor diff --git a/tests/ut/ops/test_fused_moe.py b/tests/ut/ops/test_fused_moe.py index 3a27e44e..e891a465 100644 --- a/tests/ut/ops/test_fused_moe.py +++ b/tests/ut/ops/test_fused_moe.py @@ -29,7 +29,7 @@ from vllm_ascend.ops.fused_moe.fused_moe import ( AscendFusedMoE, AscendUnquantizedFusedMoEMethod) from vllm_ascend.ops.fused_moe.moe_mlp import (cumsum_group_list, unified_apply_mlp) -from vllm_ascend.utils import AscendSocVersion, adapt_patch +from vllm_ascend.utils import AscendDeviceType, adapt_patch adapt_patch(True) @@ -129,7 +129,7 @@ def mock_dist_env(mocker: MockerFixture): return_value=mock_forward_context_obj), \ patch('vllm_ascend.ops.fused_moe.prepare_finalize.get_forward_context', return_value=mock_forward_context_obj), \ - patch("vllm_ascend.utils.get_ascend_soc_version", return_value=AscendSocVersion.A3), \ + patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType._910_93), \ patch('vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context', return_value=mock_forward_context_obj), \ patch('vllm_ascend.ops.fused_moe.moe_comm_method.MC2CommImpl._get_token_dispatcher', @@ -323,22 +323,21 @@ class TestCumsumGroupList(TestBase): class TestUnifiedApplyMLP(TestBase): @patch('vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context') - @patch('vllm_ascend.ops.fused_moe.moe_mlp.is_310p') + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._910_93) @patch('torch_npu.npu_grouped_matmul') @patch('torch_npu.npu_dynamic_quant') @patch('torch_npu.npu_dequant_swiglu_quant') def test_unified_apply_mlp_with_quantization_mc2(self, mock_npu_dequant, mock_npu_dynamic_quant, mock_npu_grouped_matmul, - mock_is_310p, + mock_soc_version, mock_get_forward_context): mock_forward_context = MagicMock() mock_forward_context.moe_comm_type = MoECommType.MC2 mock_get_forward_context.return_value = mock_forward_context - mock_is_310p.return_value = False - mock_npu_dynamic_quant.return_value = (torch.randint(-128, 127, (10, 20), dtype=torch.int8), @@ -387,7 +386,8 @@ class TestUnifiedApplyMLP(TestBase): self.assertEqual(result.dtype, torch.bfloat16) - @patch('vllm_ascend.ops.fused_moe.moe_mlp.is_310p') + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._910_93) @patch('torch_npu.npu_grouped_matmul') @patch('torch_npu.npu_swiglu') @patch('torch_npu.npu_dynamic_quant') @@ -395,9 +395,7 @@ class TestUnifiedApplyMLP(TestBase): mock_npu_dynamic_quant, mock_npu_swiglu, mock_npu_grouped_matmul, - mock_is_310p): - mock_is_310p.return_value = False - + mock_soc_version): mock_npu_grouped_matmul.side_effect = [[ torch.randn(10, 40, dtype=torch.float16) ], [torch.randn(10, 20, dtype=torch.float16)]] @@ -490,15 +488,14 @@ class TestUnifiedApplyMLP(TestBase): self.assertEqual(result.shape, hidden_states_shape) self.assertEqual(result.dtype, torch.bfloat16) - @patch('vllm_ascend.ops.fused_moe.moe_mlp.is_310p') + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._310P) @patch('torch_npu.npu_grouped_matmul') @patch('torch_npu.npu_swiglu') 
@patch('torch_npu.npu_dynamic_quant') def test_unified_apply_mlp_without_quantization_310p( self, mock_npu_dynamic_quant, mock_npu_swiglu, - mock_npu_grouped_matmul, mock_is_310p): - mock_is_310p.return_value = True - + mock_npu_grouped_matmul, mock_soc_version): mock_gmm1_out = torch.randn(10, 40, dtype=torch.float16) mock_gmm2_out = torch.randn(10, 20, dtype=torch.float16) mock_npu_grouped_matmul.side_effect = [[mock_gmm1_out], @@ -527,8 +524,6 @@ class TestUnifiedApplyMLP(TestBase): topk_scales=topk_scales, with_quant=False) - mock_is_310p.assert_called_once() - self.assertEqual(mock_npu_grouped_matmul.call_count, 2) mock_npu_swiglu.assert_called_once() diff --git a/tests/ut/ops/test_layernorm.py b/tests/ut/ops/test_layernorm.py index 9da7eb5b..314775f8 100644 --- a/tests/ut/ops/test_layernorm.py +++ b/tests/ut/ops/test_layernorm.py @@ -7,6 +7,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm from tests.ut.base import PytestBase from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod +from vllm_ascend.utils import AscendDeviceType def mock_rms_norm(x, weight, eps): @@ -60,8 +61,9 @@ class TestAscendRMSNorm(PytestBase): # Test case for addrmsnorm + w8a8 quant fusion def test_forward_oot_with_quant_fusion(self, mocker: MockerFixture): - mock_is_310p = mocker.patch("vllm_ascend.utils.is_310p") - mock_is_310p.return_value = False + mock_soc_version = mocker.patch( + "vllm_ascend.utils.get_ascend_device_type") + mock_soc_version.return_value = AscendDeviceType._910_93 mock_get_forward_context = mocker.patch( "vllm_ascend.ops.layernorm.get_forward_context") diff --git a/tests/ut/ops/test_rotary_embedding.py b/tests/ut/ops/test_rotary_embedding.py index 580a4fc3..f5d4f663 100644 --- a/tests/ut/ops/test_rotary_embedding.py +++ b/tests/ut/ops/test_rotary_embedding.py @@ -12,6 +12,7 @@ from vllm.platforms import CpuArchEnum from tests.ut.base import TestBase from vllm_ascend.ascend_forward_context import set_ascend_forward_context from vllm_ascend.ops.rotary_embedding import _custom_rotary_embedding_enabled +from vllm_ascend.utils import AscendDeviceType MODEL = "Qwen3-0.6B" MODEL_VL = "Qwen/Qwen2.5-VL-3B-Instruct" @@ -97,7 +98,8 @@ class TestAscendRotaryEmbedding(unittest.TestCase): self.mock_self.is_neox_style = self.is_neox_style @patch('torch.ops._C_ascend') - @patch('vllm_ascend.ops.rotary_embedding.is_310p', return_value=False) + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._910_93) @patch('vllm_ascend.ops.rotary_embedding._custom_rotary_embedding_enabled', return_value=True) @patch('torch.ops._npu_rotary_embedding') @@ -106,8 +108,8 @@ class TestAscendRotaryEmbedding(unittest.TestCase): @patch('vllm.distributed.parallel_state._DP', MagicMock(world_size=1)) @patch('vllm.distributed.parallel_state._TP', MagicMock(world_size=1)) def test_rope_forward_oot_custom_kernel(self, mock_rotary_embedding, - mock_custom_enabled, mock_is_310p, - mock__c): + mock_custom_enabled, + mock_soc_version, mock__c): mock_config = MagicMock() mock_config.torchair_graph_config.enabled = False diff --git a/tests/ut/ops/test_token_dispatcher.py b/tests/ut/ops/test_token_dispatcher.py index 4abe3d7f..15fd094f 100644 --- a/tests/ut/ops/test_token_dispatcher.py +++ b/tests/ut/ops/test_token_dispatcher.py @@ -22,7 +22,7 @@ import torch from tests.ut.base import TestBase from vllm_ascend.ops.fused_moe.token_dispatcher import ( # isort: skip - AscendSocVersion, TokenDispatcherWithAll2AllV, + AscendDeviceType, TokenDispatcherWithAll2AllV, 
TokenDispatcherWithAllGather, TokenDispatcherWithMC2) @@ -50,10 +50,10 @@ class TestTokenDispatcherWithMC2(TestBase): return_value=self.forward_context) self.forward_context_patch.start() - # Mock get_ascend_soc_version() + # Mock get_ascend_device_type() self.ascend_soc_version_patch = patch( - "vllm_ascend.ops.fused_moe.token_dispatcher.get_ascend_soc_version", - return_value=AscendSocVersion.A3) + "vllm_ascend.ops.fused_moe.token_dispatcher.get_ascend_device_type", + return_value=AscendDeviceType._910_93) self.ascend_soc_version_patch.start() kwargs = {"with_quant": False, "top_k": 8, "num_experts": 128} diff --git a/tests/ut/quantization/test_w8a8.py b/tests/ut/quantization/test_w8a8.py index dbefae30..ce18023f 100644 --- a/tests/ut/quantization/test_w8a8.py +++ b/tests/ut/quantization/test_w8a8.py @@ -12,6 +12,7 @@ from vllm_ascend.quantization.w8a8 import (AscendC8KVCacheMethod, AscendW8A8LinearMethod, fused_experts, fused_experts_310p, quant_per_tensor) +from vllm_ascend.utils import AscendDeviceType class TestQuantPerTensor(TestBase): @@ -118,9 +119,11 @@ class TestAscendW8A8LinearMethod(TestBase): expected_y_output += bias self.assertTrue(torch.equal(output, expected_y_output)) - @patch("vllm_ascend.quantization.w8a8.is_310p", return_value=True) + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._310P) @patch("torch_npu.npu_quant_matmul") - def test_apply_with_x_is_310p(self, mock_npu_quant_matmul, mock_is_310p): + def test_apply_with_x_is_310p(self, mock_npu_quant_matmul, + mock_soc_version): layer = MagicMock() layer.aclnn_input_scale = 0.1 layer.aclnn_input_offset = 0.2 @@ -279,11 +282,12 @@ class TestAscendW8A8FusedMoEMethod(TestBase): mock_fused_experts.assert_called_once() self.assertEqual(result.shape, (32, self.hidden_size)) - @patch("vllm_ascend.quantization.w8a8.is_310p", return_value=True) + @patch('vllm_ascend.quantization.w8a8.get_ascend_device_type', + return_value=AscendDeviceType._310P) @patch('vllm_ascend.quantization.w8a8.select_experts') @patch('vllm_ascend.quantization.w8a8.fused_experts_310p') def test_apply_is_310p(self, mock_fused_experts_310p, mock_select_experts, - mock_is_310p): + mock_soc_version): # Setup mock_layer = MagicMock() x = torch.randn(32, self.hidden_size) @@ -342,8 +346,9 @@ class TestAscendC8KVCacheMethod(TestBase): expected_shape = (self.layer.num_kv_heads * self.layer.head_size, ) self.assertEqual(param.shape, expected_shape) - @patch("vllm_ascend.quantization.w8a8.is_310p", return_value=False) - def test_process_weights_after_loading_not_310p(self, mock_is_310p): + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._910_93) + def test_process_weights_after_loading_not_310p(self, mock_soc_version): key_data = torch.ones(4 * 64) value_data = torch.ones(4 * 64) * 2 @@ -356,8 +361,9 @@ class TestAscendC8KVCacheMethod(TestBase): self.assertTrue(torch.all(self.method.antiquant_scale_comb[0] == 1)) self.assertTrue(torch.all(self.method.antiquant_scale_comb[1] == 2)) - @patch("vllm_ascend.quantization.w8a8.is_310p", return_value=True) - def test_process_weights_after_loading_is_310p(self, mock_is_310p): + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._310P) + def test_process_weights_after_loading_is_310p(self, mock_soc_version): key_data = torch.ones(4 * 64) value_data = torch.ones(4 * 64) * 2 diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index 5b6a46c1..798cf14a 100644 --- a/tests/ut/test_platform.py +++ 
b/tests/ut/test_platform.py @@ -9,7 +9,7 @@ from vllm.platforms import PlatformEnum from tests.ut.base import TestBase from vllm_ascend.platform import NPUPlatform -from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD +from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, AscendDeviceType class TestNPUPlatform(TestBase): @@ -231,13 +231,14 @@ class TestNPUPlatform(TestBase): @patch("vllm_ascend.ascend_config.check_ascend_config") @patch("vllm_ascend.ascend_config.init_ascend_config") @patch("vllm_ascend.utils.update_aclgraph_sizes") - @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._910_93) @patch("os.environ", {}) @patch( "vllm_ascend.core.recompute_schedule_config.RecomputeSchedulerConfig.initialize_from_config" ) def test_check_and_update_config_basic_config_update( - self, mock_init_recompute, mock_is_310p, mock_update_acl, + self, mock_init_recompute, mock_soc_version, mock_update_acl, mock_init_ascend, mock_check_ascend): mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( ) @@ -259,7 +260,8 @@ class TestNPUPlatform(TestBase): mock_init_ascend.assert_called_once_with(vllm_config) mock_check_ascend.assert_called_once() - @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._910_93) @patch("vllm_ascend.ascend_config.check_ascend_config") @patch("vllm_ascend.ascend_config.init_ascend_config") @patch( @@ -267,7 +269,7 @@ class TestNPUPlatform(TestBase): ) def test_check_and_update_config_no_model_config_warning( self, mock_init_recompute, mock_init_ascend, mock_check_ascend, - mock_is_310p): + mock_soc_version): mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( ) vllm_config = TestNPUPlatform.mock_vllm_config() @@ -283,7 +285,8 @@ class TestNPUPlatform(TestBase): self.platform.check_and_update_config(vllm_config) self.assertTrue("Model config is missing" in cm.output[0]) - @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._910_93) @patch("vllm_ascend.ascend_config.check_ascend_config") @patch("vllm_ascend.ascend_config.init_ascend_config") @patch( @@ -291,7 +294,7 @@ class TestNPUPlatform(TestBase): ) def test_check_and_update_config_enforce_eager_mode( self, mock_init_recompute, mock_init_ascend, mock_check_ascend, - mock_is_310p): + mock_soc_version): mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( ) vllm_config = TestNPUPlatform.mock_vllm_config() @@ -318,7 +321,8 @@ class TestNPUPlatform(TestBase): CUDAGraphMode.NONE, ) - @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._910_93) @patch("vllm_ascend.utils.update_default_aclgraph_sizes") @patch("vllm_ascend.ascend_config.check_ascend_config") @patch("vllm_ascend.ascend_config.init_ascend_config") @@ -327,7 +331,7 @@ class TestNPUPlatform(TestBase): ) def test_check_and_update_config_unsupported_compilation_level( self, mock_init_recompute, mock_init_ascend, mock_check_ascend, - mock_update_default, mock_is_310p): + mock_update_default, mock_soc_version): mock_update_default.return_value = MagicMock() mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( ) @@ -357,11 +361,12 @@ class TestNPUPlatform(TestBase): @pytest.mark.skip( "Revert me when vllm support setting cudagraph_mode on 
oot platform") - @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._910_93) @patch("vllm_ascend.ascend_config.check_ascend_config") @patch("vllm_ascend.ascend_config.init_ascend_config") def test_check_and_update_config_unsupported_cudagraph_mode( - self, mock_init_ascend, mock_check_ascend, mock_is_310p): + self, mock_init_ascend, mock_check_ascend, mock_soc_version): mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( ) vllm_config = TestNPUPlatform.mock_vllm_config() @@ -386,7 +391,8 @@ class TestNPUPlatform(TestBase): CUDAGraphMode.NONE, ) - @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._910_93) @patch("vllm_ascend.utils.update_default_aclgraph_sizes") @patch("vllm_ascend.ascend_config.check_ascend_config") @patch("vllm_ascend.ascend_config.init_ascend_config") @@ -395,7 +401,7 @@ class TestNPUPlatform(TestBase): ) def test_check_and_update_config_torchair_enabled_compilation( self, mock_init_recompute, mock_init_ascend, mock_check_ascend, - mock_update_default, mock_is_310p): + mock_update_default, mock_soc_version): mock_update_default.return_value = MagicMock() mock_ascend_config = TestNPUPlatform.mock_vllm_ascend_config() mock_ascend_config.torchair_graph_config.enabled = True @@ -424,7 +430,8 @@ class TestNPUPlatform(TestBase): CUDAGraphMode.NONE, ) - @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._910_93) @patch("vllm_ascend.ascend_config.check_ascend_config") @patch("vllm_ascend.ascend_config.init_ascend_config") @patch( @@ -432,7 +439,7 @@ class TestNPUPlatform(TestBase): ) def test_check_and_update_config_cache_config_block_size( self, mock_init_recompute, mock_init_ascend, mock_check_ascend, - mock_is_310p): + mock_soc_version): mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( ) vllm_config = TestNPUPlatform.mock_vllm_config() @@ -450,7 +457,8 @@ class TestNPUPlatform(TestBase): self.assertEqual(vllm_config.cache_config.block_size, 128) - @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._910_93) @patch("vllm_ascend.ascend_config.check_ascend_config") @patch("vllm_ascend.ascend_config.init_ascend_config") @patch( @@ -458,7 +466,7 @@ class TestNPUPlatform(TestBase): ) def test_check_and_update_config_v1_worker_class_selection( self, mock_init_recompute, mock_init_ascend, mock_check_ascend, - mock_is_310p): + mock_soc_version): mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( ) vllm_config = TestNPUPlatform.mock_vllm_config() @@ -489,12 +497,13 @@ class TestNPUPlatform(TestBase): @patch("vllm_ascend.ascend_config.check_ascend_config") @patch("vllm_ascend.ascend_config.init_ascend_config") - @patch("vllm_ascend.utils.is_310p", return_value=True) + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._310P) @patch( "vllm_ascend.core.recompute_schedule_config.RecomputeSchedulerConfig.initialize_from_config" ) def test_check_and_update_config_310p_no_custom_ops( - self, mock_init_recompute, mock_is_310p, mock_init_ascend, + self, mock_init_recompute, mock_soc_version, mock_init_ascend, mock_check_ascend): mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( ) @@ -511,7 +520,8 @@ class 
TestNPUPlatform(TestBase): self.platform.check_and_update_config(vllm_config) self.assertEqual(vllm_config.compilation_config.custom_ops, []) - @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._910_93) @patch("vllm_ascend.ascend_config.check_ascend_config") @patch("vllm_ascend.ascend_config.init_ascend_config") @patch( @@ -519,7 +529,7 @@ class TestNPUPlatform(TestBase): ) def test_check_and_update_config_ascend_scheduler_config( self, mock_init_recompute, mock_init_ascend, mock_check_ascend, - mock_is_310p): + mock_soc_version): mock_ascend_config = TestNPUPlatform.mock_vllm_ascend_config() mock_ascend_config.ascend_scheduler_config.enabled = True mock_init_ascend.return_value = mock_ascend_config diff --git a/tests/ut/test_utils.py b/tests/ut/test_utils.py index d5b87130..29ed7b44 100644 --- a/tests/ut/test_utils.py +++ b/tests/ut/test_utils.py @@ -35,16 +35,6 @@ class TestUtils(TestBase): from vllm_ascend import platform importlib.reload(platform) - def test_is_310p(self): - utils._IS_310P = None - with mock.patch("vllm_ascend._build_info.__soc_version__", - "Ascend310P3"): - self.assertTrue(utils.is_310p()) - utils._IS_310P = None - with mock.patch("vllm_ascend._build_info.__soc_version__", - "Ascend910P1"): - self.assertFalse(utils.is_310p()) - def test_is_enable_nz(self): with mock.patch("vllm_ascend.utils.envs_ascend.VLLM_ASCEND_ENABLE_NZ", 1): diff --git a/tests/ut/torchair/ops/test_torchair_fused_moe.py b/tests/ut/torchair/ops/test_torchair_fused_moe.py index cf306d2a..f0782e75 100644 --- a/tests/ut/torchair/ops/test_torchair_fused_moe.py +++ b/tests/ut/torchair/ops/test_torchair_fused_moe.py @@ -28,7 +28,7 @@ from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod from vllm_ascend.torchair.ops.torchair_fused_moe import ( TorchairAscendFusedMoE, TorchairAscendUnquantizedFusedMoEMethod) from vllm_ascend.utils import adapt_patch # noqa E402 -from vllm_ascend.utils import AscendSocVersion +from vllm_ascend.utils import AscendDeviceType adapt_patch(True) @@ -398,7 +398,7 @@ class TestTorchairAscendUnquantizedFusedMoEMethod: forward_context = MagicMock( fused_moe_state=get_fused_moe_state(ep_size, is_prefill, True)) with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \ - patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3): + patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_device_type", return_value=AscendDeviceType._910_93): expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]) moe_method.ep_size = ep_size x = torch.randn(8, 2, 2) diff --git a/tests/ut/torchair/ops/test_torchair_rotary_embedding.py b/tests/ut/torchair/ops/test_torchair_rotary_embedding.py index 4adb5988..73a78b77 100644 --- a/tests/ut/torchair/ops/test_torchair_rotary_embedding.py +++ b/tests/ut/torchair/ops/test_torchair_rotary_embedding.py @@ -8,6 +8,7 @@ from vllm_ascend.torchair.ops.torchair_rotary_embedding import ( _set_cos_sin_cache, custom_rotary_embedding_enabled, native_rope_deepseek_forward, rope_forward_oot, rotate_half, yarn_find_correction_dim, yarn_get_mscale) +from vllm_ascend.utils import AscendDeviceType class TestCustomRotaryEmbeddingEnabled(TestBase): @@ -107,14 +108,15 @@ class TestRopeForwardOot(TestBase): @patch('torch.ops._C_ascend') @patch( 'vllm_ascend.torchair.ops.torchair_rotary_embedding.get_ascend_config') - 
@patch('vllm_ascend.torchair.ops.torchair_rotary_embedding.is_310p', - return_value=False) + @patch('vllm_ascend.utils.get_ascend_device_type', + return_value=AscendDeviceType._910_93) @patch( 'vllm_ascend.torchair.ops.torchair_rotary_embedding.custom_rotary_embedding_enabled', return_value=True) @patch('torch.ops._npu_rotary_embedding') def test_rope_forward_oot_custom_kernel(self, mock_rotary_embedding, - mock_custom_enabled, mock_is_310p, + mock_custom_enabled, + mock_soc_version, mock_get_ascend_config, mock__c): mock_config = MagicMock() mock_config.torchair_graph_config.enabled = False diff --git a/tests/ut/torchair/quantization/test_torchair_w8a8_dynamic.py b/tests/ut/torchair/quantization/test_torchair_w8a8_dynamic.py index 994f5c86..11ad00a2 100644 --- a/tests/ut/torchair/quantization/test_torchair_w8a8_dynamic.py +++ b/tests/ut/torchair/quantization/test_torchair_w8a8_dynamic.py @@ -5,7 +5,7 @@ import torch from tests.ut.base import TestBase from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import ( torchair_fused_experts_with_all2all, torchair_fused_experts_with_mc2) -from vllm_ascend.utils import AscendSocVersion +from vllm_ascend.utils import AscendDeviceType class TestAscendW8A8FusedMoEMethod(TestBase): @@ -79,7 +79,7 @@ class TestAscendW8A8FusedMoEMethod(TestBase): 'HCCL_INTRA_PCIE_ENABLE': '1' }) @patch( - "vllm_ascend.torchair.quantization.torchair_w8a8_dynamic.get_ascend_soc_version" + "vllm_ascend.torchair.quantization.torchair_w8a8_dynamic.get_ascend_device_type" ) @patch( 'vllm_ascend.torchair.quantization.torchair_w8a8_dynamic.get_mc2_group' @@ -94,7 +94,7 @@ class TestAscendW8A8FusedMoEMethod(TestBase): mock_ascend_soc_version): """Test expert_scales is passed in A2 SOC version with mc2 optimization""" # Setup mocks - mock_ascend_soc_version.return_value = AscendSocVersion.A2 + mock_ascend_soc_version.return_value = AscendDeviceType._910B mock_group = MagicMock() mock_group.rank_in_group = 0 diff --git a/tests/ut/worker/test_model_runner_v1.py b/tests/ut/worker/test_model_runner_v1.py index 1d781490..0f27548a 100644 --- a/tests/ut/worker/test_model_runner_v1.py +++ b/tests/ut/worker/test_model_runner_v1.py @@ -16,7 +16,7 @@ from unittest.mock import MagicMock, patch import pytest from vllm_ascend.ascend_forward_context import MoECommType -from vllm_ascend.utils import AscendSocVersion +from vllm_ascend.utils import AscendDeviceType from vllm_ascend.worker.model_runner_v1 import NPUModelRunner @@ -25,21 +25,21 @@ from vllm_ascend.worker.model_runner_v1 import NPUModelRunner "soc_version, enable_expert_parallel, world_size, num_tokens, mc2_tokens_capacity, quant_type, expected_method", [ # Case 1: Expert parallel is disabled, should always be 'allgather' - (AscendSocVersion.A2, False, 8, 100, 256, None, MoECommType.ALLGATHER), - (AscendSocVersion.A3, False, 16, 500, 256, None, MoECommType.ALLGATHER), + (AscendDeviceType._910B, False, 8, 100, 256, None, MoECommType.ALLGATHER), + (AscendDeviceType._910_93, False, 16, 500, 256, None, MoECommType.ALLGATHER), # Case 2: A2 SOC with w4a8_dynamic -> use alltoall when not mc2 - (AscendSocVersion.A2, True, 8, 100, 256, "w4a8_dynamic", MoECommType.ALLTOALL), - (AscendSocVersion.A2, True, 16, 257, 256, "w4a8_dynamic", MoECommType.ALLTOALL), - (AscendSocVersion.A2, True, 16, 100, 256, "w4a8_dynamic", MoECommType.MC2), # meets mc2 condition + (AscendDeviceType._910B, True, 8, 100, 256, "w4a8_dynamic", MoECommType.ALLTOALL), + (AscendDeviceType._910B, True, 16, 257, 256, "w4a8_dynamic", MoECommType.ALLTOALL), + 
(AscendDeviceType._910B, True, 16, 100, 256, "w4a8_dynamic", MoECommType.MC2), # meets mc2 condition # Case 3: A2 SOC without w4a8_dynamic -> fallback to allgather - (AscendSocVersion.A2, True, 8, 100, 256, None, MoECommType.ALLGATHER), - (AscendSocVersion.A2, True, 16, 257, 256, None, MoECommType.ALLGATHER), + (AscendDeviceType._910B, True, 8, 100, 256, None, MoECommType.ALLGATHER), + (AscendDeviceType._910B, True, 16, 257, 256, None, MoECommType.ALLGATHER), # Case 4: A3 SOC - (AscendSocVersion.A3, True, 8, 100, 256, None, MoECommType.MC2), - (AscendSocVersion.A3, True, 8, 257, 256, None, MoECommType.ALLTOALL), + (AscendDeviceType._910_93, True, 8, 100, 256, None, MoECommType.MC2), + (AscendDeviceType._910_93, True, 8, 257, 256, None, MoECommType.ALLTOALL), ]) # yapf: enable def test_select_moe_comm_method(soc_version, enable_expert_parallel, @@ -65,7 +65,7 @@ def test_select_moe_comm_method(soc_version, enable_expert_parallel, mock_runner.vllm_config = mock_vllm_config # Patch the helper functions - with patch('vllm_ascend.worker.model_runner_v1.get_ascend_soc_version', + with patch('vllm_ascend.worker.model_runner_v1.get_ascend_device_type', return_value=soc_version), \ patch('vllm_ascend.worker.model_runner_v1.is_global_first_rank', return_value=True), \ @@ -100,7 +100,7 @@ def test_select_moe_comm_method_unsupported_soc(): unsupported_soc = "UnsupportedSOC" - with patch('vllm_ascend.worker.model_runner_v1.get_ascend_soc_version', + with patch('vllm_ascend.worker.model_runner_v1.get_ascend_device_type', return_value=unsupported_soc), \ patch('vllm_ascend.worker.model_runner_v1.is_global_first_rank', return_value=True), \ diff --git a/tests/ut/worker/test_worker_v1.py b/tests/ut/worker/test_worker_v1.py index 11be9fb8..fbc7fdc4 100644 --- a/tests/ut/worker/test_worker_v1.py +++ b/tests/ut/worker/test_worker_v1.py @@ -52,7 +52,7 @@ class TestNPUWorker(TestBase): @patch("vllm_ascend.worker.worker_v1.register_ascend_customop") @patch("vllm_ascend.worker.worker_v1.get_ascend_config") @patch("vllm_ascend.worker.worker_v1.init_ascend_config") - @patch("vllm_ascend.worker.worker_v1.init_ascend_soc_version") + @patch("vllm_ascend.worker.worker_v1.check_ascend_device_type") @patch("vllm_ascend.worker.worker_v1.try_register_lib") @patch(init_cached_hf_modules_path) @patch("vllm_ascend.worker.worker_v1.NPUWorker._init_profiler") @@ -61,7 +61,7 @@ class TestNPUWorker(TestBase): mock_init_profiler, mock_init_cached_hf_modules, mock_try_register_lib, - mock_init_ascend_soc_version, + mock_check_ascend_device_type, mock_init_ascend_config, mock_get_ascend_config, mock_register_ascend_customop, @@ -93,7 +93,7 @@ class TestNPUWorker(TestBase): mock_register_atb_extensions.assert_called_once() mock_register_ascend_customop.assert_called_once() mock_init_ascend_config.assert_called_once_with(self.vllm_config_mock) - mock_init_ascend_soc_version.assert_called_once() + mock_check_ascend_device_type.assert_called_once() # Verify try_register_lib call mock_try_register_lib.assert_called_once_with( @@ -114,7 +114,7 @@ class TestNPUWorker(TestBase): @patch("vllm_ascend.worker.worker_v1.register_ascend_customop") @patch("vllm_ascend.worker.worker_v1.get_ascend_config") @patch("vllm_ascend.worker.worker_v1.init_ascend_config") - @patch("vllm_ascend.worker.worker_v1.init_ascend_soc_version") + @patch("vllm_ascend.worker.worker_v1.check_ascend_device_type") @patch("vllm_ascend.worker.worker_v1.try_register_lib") @patch(init_cached_hf_modules_path) @patch("vllm_ascend.worker.worker_v1.NPUWorker._init_profiler") @@ 
-123,7 +123,7 @@ class TestNPUWorker(TestBase): mock_init_profiler, mock_init_cached_hf_modules, mock_try_register_lib, - mock_init_ascend_soc_version, + mock_check_ascend_device_type, mock_init_ascend_config, mock_get_ascend_config, mock_register_ascend_customop, @@ -159,7 +159,7 @@ class TestNPUWorker(TestBase): @patch("vllm_ascend.worker.worker_v1.register_ascend_customop") @patch("vllm_ascend.worker.worker_v1.get_ascend_config") @patch("vllm_ascend.worker.worker_v1.init_ascend_config") - @patch("vllm_ascend.worker.worker_v1.init_ascend_soc_version") + @patch("vllm_ascend.worker.worker_v1.check_ascend_device_type") @patch("vllm_ascend.worker.worker_v1.try_register_lib") @patch(init_cached_hf_modules_path) @patch("vllm_ascend.worker.worker_v1.NPUWorker._init_profiler") @@ -168,7 +168,7 @@ class TestNPUWorker(TestBase): mock_init_profiler, mock_init_cached_hf_modules, mock_try_register_lib, - mock_init_ascend_soc_version, + mock_check_ascend_device_type, mock_init_ascend_config, mock_get_ascend_config, mock_register_ascend_customop, diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 4ea29ee2..1d9139c5 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -42,9 +42,9 @@ from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata, from vllm_ascend.compilation.acl_graph import (get_graph_params, update_graph_params_workspaces) from vllm_ascend.ops.attention import vanilla_chunked_prefill -from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, - nd_to_nz_2d, nd_to_nz_spec, - prefill_context_parallel_enable, +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendDeviceType, + aligned_16, get_ascend_device_type, nd_to_nz_2d, + nd_to_nz_spec, prefill_context_parallel_enable, weak_ref_tensors) # isort: off @@ -83,7 +83,7 @@ class AscendAttentionBackend(AttentionBackend): num_kv_heads: int, head_size: int, ) -> Tuple[int, ...]: - if is_310p(): + if get_ascend_device_type() == AscendDeviceType._310P: return (2, num_blocks, num_kv_heads * head_size // 16, block_size, 16) return (2, num_blocks, block_size, num_kv_heads, head_size) @@ -351,7 +351,7 @@ class AscendAttentionMetadataBuilder: query_start_loc = query_start_loc_cpu.to(self.device, non_blocking=True) - if is_310p(): + if get_ascend_device_type() == AscendDeviceType._310P: if attn_state == AscendAttentionState.PrefillNoCache: mask_nz = nd_to_nz_2d(attn_mask) attn_mask = torch_npu.npu_format_cast(mask_nz.contiguous(), @@ -702,7 +702,7 @@ class AscendAttentionBackendImpl(AttentionImpl): mask = attn_metadata.attn_mask - if is_310p(): + if get_ascend_device_type() == AscendDeviceType._310P: # align q k v output tensors query = aligned_16(query) key = aligned_16(key) @@ -783,7 +783,7 @@ class AscendAttentionBackendImpl(AttentionImpl): attn_metadata: AscendMetadata, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: - if is_310p(): + if get_ascend_device_type() == AscendDeviceType._310P: # seq_lens_tensor needs to be transferred to the device for 310P. attn_metadata.seq_lens = \ attn_metadata.seq_lens.to(device=query.device) @@ -857,7 +857,7 @@ class AscendAttentionBackendImpl(AttentionImpl): assert attn_metadata is not None assert attn_metadata.attn_mask is not None - if is_310p(): + if get_ascend_device_type() == AscendDeviceType._310P: # Do reformat in case of broadcasted tensors. 
attn_metadata.attn_mask = \ torch_npu.npu_format_cast(attn_metadata.attn_mask.contiguous(), diff --git a/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py b/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py index 2ef8bf83..c0bd06d4 100644 --- a/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py +++ b/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py @@ -32,7 +32,7 @@ from vllm.v1.request import Request, RequestStatus import vllm_ascend.envs as envs_ascend from vllm_ascend.distributed.utils import get_transfer_timeout_value -from vllm_ascend.utils import (AscendSocVersion, get_ascend_soc_version, +from vllm_ascend.utils import (AscendDeviceType, get_ascend_device_type, prefill_context_parallel_enable) if prefill_context_parallel_enable(): @@ -376,7 +376,7 @@ class LLMDataDistCMgrConnectorWorker(): self.local_agent_metadata.cluster_id) self.init_llm_datadist() self.finished_reqs: set[str] = set() - self.soc_info = get_ascend_soc_version() + self.soc_info = get_ascend_device_type() # Set hccl deterministic for model execute os.environ["HCCL_DETERMINISTIC"] = "true" self.done_receiving_counts: defaultdict[str, @@ -761,7 +761,7 @@ class LLMDataDistCMgrConnectorWorker(): rank_table["server_list"].append( # type: ignore[attr-defined] decode_server_device_info) - if self.soc_info == AscendSocVersion.A3: + if self.soc_info == AscendDeviceType._910_93: # generate super_pod_list for rank table super_pod_list = [] prefill_super_pod_info = { diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index f2877a46..cd148da3 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -50,11 +50,11 @@ env_variables: Dict[str, Callable[[], Any]] = { # value is None, which means the system default C compiler will be used. "C_COMPILER": lambda: os.getenv("C_COMPILER", None), - # The version of the Ascend chip. If not set, the default value is - # ASCEND910B1(Available for A2 and A3 series). It's used for package building. + # The version of the Ascend chip. It's used for package building. + # If not set, we will query chip info through `npu-smi`. # Please make sure that the version is correct. 
"SOC_VERSION": - lambda: os.getenv("SOC_VERSION", "ASCEND910B1"), + lambda: os.getenv("SOC_VERSION", None), # If set, vllm-ascend will print verbose logs during compilation "VERBOSE": lambda: bool(int(os.getenv('VERBOSE', '0'))), diff --git a/vllm_ascend/lora/punica_npu.py b/vllm_ascend/lora/punica_npu.py index 1ff9de60..3dba7ee9 100644 --- a/vllm_ascend/lora/punica_npu.py +++ b/vllm_ascend/lora/punica_npu.py @@ -4,9 +4,9 @@ from typing import Callable, Optional, Tuple, Union import torch -from vllm_ascend.utils import is_310p +from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type -if is_310p(): +if get_ascend_device_type() == AscendDeviceType._310P: from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice, bgmv_shrink, sgmv_expand, sgmv_expand_slice, sgmv_shrink) diff --git a/vllm_ascend/ops/activation.py b/vllm_ascend/ops/activation.py index fb1abe66..4889d232 100644 --- a/vllm_ascend/ops/activation.py +++ b/vllm_ascend/ops/activation.py @@ -33,10 +33,10 @@ class AscendSiluAndMul(SiluAndMul): def forward_oot(self, x: torch.Tensor) -> torch.Tensor: import torch_npu - from vllm_ascend.utils import is_310p + from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type torch.ops.vllm.maybe_prefetch_mlp_down_proj(x) - if is_310p(): + if get_ascend_device_type() == AscendDeviceType._310P: out = torch_npu.npu_swiglu(x.to(torch.float32)).to(torch.float16) else: out = torch_npu.npu_swiglu(x) diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py index 0e536100..b9667abb 100644 --- a/vllm_ascend/ops/fused_moe/fused_moe.py +++ b/vllm_ascend/ops/fused_moe/fused_moe.py @@ -43,9 +43,9 @@ from vllm_ascend.quantization.w4a8_dynamic import \ AscendW4A8DynamicFusedMoEMethod from vllm_ascend.quantization.w8a8_dynamic import \ AscendW8A8DynamicFusedMoEMethod -from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, enable_sp, is_310p, - is_enable_nz, npu_stream_switch, - shared_expert_dp_enabled, +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendDeviceType, + enable_sp, get_ascend_device_type, is_enable_nz, + npu_stream_switch, shared_expert_dp_enabled, shared_experts_calculation_stream) @@ -79,7 +79,8 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod): w2_data = self._maybe_pad_weight(layer.w2_weight.data) layer.w2_weight = torch.nn.Parameter(w2_data, requires_grad=False) - if not is_310p() and is_enable_nz(): + if get_ascend_device_type() != AscendDeviceType._310P and is_enable_nz( + ): layer.w13_weight.data = torch_npu.npu_format_cast( layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ) layer.w2_weight.data = torch_npu.npu_format_cast( diff --git a/vllm_ascend/ops/fused_moe/moe_mlp.py b/vllm_ascend/ops/fused_moe/moe_mlp.py index 0e2b81fb..07ba732f 100644 --- a/vllm_ascend/ops/fused_moe/moe_mlp.py +++ b/vllm_ascend/ops/fused_moe/moe_mlp.py @@ -22,7 +22,8 @@ from torch.nn.functional import pad from vllm.forward_context import get_forward_context from vllm_ascend.ascend_forward_context import MoECommType -from vllm_ascend.utils import dispose_tensor, is_310p +from vllm_ascend.utils import (AscendDeviceType, dispose_tensor, + get_ascend_device_type) def cumsum_group_list(group_list: torch.Tensor, @@ -210,7 +211,7 @@ def unquant_apply_mlp(hidden_states: torch.Tensor, group_type=0, group_list=group_list, )[0] - if is_310p(): + if get_ascend_device_type() == AscendDeviceType._310P: gate_up_out = torch_npu.npu_swiglu(gate_up_out.to(torch.float32)).to( torch.float16) else: diff --git 
a/vllm_ascend/ops/fused_moe/token_dispatcher.py b/vllm_ascend/ops/fused_moe/token_dispatcher.py index 1ef06533..57f26046 100644 --- a/vllm_ascend/ops/fused_moe/token_dispatcher.py +++ b/vllm_ascend/ops/fused_moe/token_dispatcher.py @@ -30,7 +30,7 @@ from vllm.distributed.parallel_state import get_ep_group from vllm_ascend.distributed.parallel_state import get_mc2_group from vllm_ascend.ops.fused_moe.comm_utils import ( async_all_to_all, gather_from_sequence_parallel_region) -from vllm_ascend.utils import (AscendSocVersion, get_ascend_soc_version, +from vllm_ascend.utils import (AscendDeviceType, get_ascend_device_type, is_hierarchical_communication_enabled) @@ -98,11 +98,11 @@ class TokenDispatcherWithMC2(MoETokenDispatcher): self.enable_dispatch_v2 = hasattr(torch_npu, "npu_moe_distribute_dispatch_v2") self.need_extra_args = ( - get_ascend_soc_version() == AscendSocVersion.A3) + get_ascend_device_type() == AscendDeviceType._910_93) # NOTE: Currently, when in A3, we need to pass in some extra param into dispatch & combine self.a3_need_extra_args = \ - get_ascend_soc_version() == AscendSocVersion.A3 + get_ascend_device_type() == AscendDeviceType._910_93 # NOTE: When in A2, setting the environment variables HCCL_INTRA_PCIE_ENABLE=1 and # HCCL_INTRA_ROCE_ENABLE=0 can reduce cross-machine communication traffic and significantly # improve communication performance. diff --git a/vllm_ascend/ops/layernorm.py b/vllm_ascend/ops/layernorm.py index 6b89f4a5..8c395b54 100644 --- a/vllm_ascend/ops/layernorm.py +++ b/vllm_ascend/ops/layernorm.py @@ -32,9 +32,10 @@ def _addrmsnorm_forward_oot( ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: import torch_npu - from vllm_ascend.utils import is_310p + from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type - if layer is not None and not is_310p(): + if layer is not None and get_ascend_device_type( + ) != AscendDeviceType._310P: layer_cls_name = layer.__class__.__name__ try: weight_prefetch_method = get_forward_context( @@ -67,7 +68,7 @@ def _addrmsnorm_forward_oot( ) else: - if is_310p(): + if get_ascend_device_type() == AscendDeviceType._310P: orig_dtype = residual.dtype x = x + residual.to(x.dtype) residual = x.to(orig_dtype) @@ -195,9 +196,9 @@ class AscendGemmaRMSNorm(GemmaRMSNorm): ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: import torch_npu - from vllm_ascend.utils import is_310p + from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type if residual is not None: - if is_310p(): + if get_ascend_device_type() == AscendDeviceType._310P: orig_dtype = residual.dtype x = x + residual.to(x.dtype) residual = x.to(orig_dtype) diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py index c5180ada..91a6f09f 100644 --- a/vllm_ascend/ops/rotary_embedding.py +++ b/vllm_ascend/ops/rotary_embedding.py @@ -27,7 +27,8 @@ from vllm.model_executor.layers.rotary_embedding import ( from vllm.platforms import CpuArchEnum from vllm_ascend.platform import NPUPlatform -from vllm_ascend.utils import enable_custom_op, is_310p +from vllm_ascend.utils import (AscendDeviceType, enable_custom_op, + get_ascend_device_type) def _custom_rotary_embedding_enabled(query, neox_style, head_size): @@ -49,8 +50,9 @@ def _rope_forward_oot( if self.cos_sin_cache.dtype != query.dtype: self.cos_sin_cache = self.cos_sin_cache.to(query.dtype) # adopt custom kernel path for rotary_embedding - if _custom_rotary_embedding_enabled(query, is_neox_style, - self.head_size) and not is_310p(): + if 
_custom_rotary_embedding_enabled( + query, is_neox_style, self.head_size) and get_ascend_device_type( + ) != AscendDeviceType._310P: query, key = torch.ops._C_ascend.rotary_embedding( positions, query, diff --git a/vllm_ascend/patch/platform/patch_distributed.py b/vllm_ascend/patch/platform/patch_distributed.py index 67d4797f..467cc045 100644 --- a/vllm_ascend/patch/platform/patch_distributed.py +++ b/vllm_ascend/patch/platform/patch_distributed.py @@ -21,7 +21,7 @@ import torch import vllm.envs as envs_vllm from vllm.config import ParallelConfig -from vllm_ascend.utils import is_310p +from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type def parallel_config_get_dp_port(self) -> int: @@ -111,5 +111,5 @@ def communication_adaptation_310p(): torch.distributed.distributed_c10d.all_reduce) -if is_310p(): +if get_ascend_device_type() == AscendDeviceType._310P: communication_adaptation_310p() diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 22f48ff0..0797da32 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -30,8 +30,9 @@ from vllm_ascend.ascend_config import (check_ascend_config, get_ascend_config, init_ascend_config) from vllm_ascend.torchair.utils import (check_torchair_cache_exist, delete_torchair_cache_file) -from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p, - is_vl_model, prefill_context_parallel_enable, +from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, AscendDeviceType, + enable_sp, get_ascend_device_type, is_vl_model, + prefill_context_parallel_enable, update_aclgraph_sizes, update_cudagraph_capture_sizes, update_default_aclgraph_sizes) @@ -281,7 +282,7 @@ class NPUPlatform(Platform): cache_config.block_size = origin_block_size # Activate custom ops for v1, except on 310P - if not is_310p(): + if get_ascend_device_type() != AscendDeviceType._310P: compilation_config.custom_ops = ["all"] # If ascend_scheduler_config is enabled, diff --git a/vllm_ascend/quantization/w8a8.py b/vllm_ascend/quantization/w8a8.py index dcd692ac..c4f8fb04 100644 --- a/vllm_ascend/quantization/w8a8.py +++ b/vllm_ascend/quantization/w8a8.py @@ -25,7 +25,8 @@ from vllm.forward_context import get_forward_context from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.ops.fused_moe.experts_selector import select_experts -from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p, is_enable_nz +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendDeviceType, + get_ascend_device_type, is_enable_nz) def quant_per_tensor(in_tensor: torch.Tensor, @@ -45,7 +46,8 @@ class AscendW8A8LinearMethod: def __init__(self) -> None: # aclnn quant matmul requires to transpose matrix B, set to true by default. - self.transpose_weight = not is_310p() + self.transpose_weight = get_ascend_device_type( + ) != AscendDeviceType._310P @staticmethod def get_weight( @@ -147,7 +149,7 @@ class AscendW8A8LinearMethod: ) quant_bias = layer.quant_bias if tp_rank == 0 else None - if is_310p(): + if get_ascend_device_type() == AscendDeviceType._310P: # On 300I Duo platform, we need transpose again if # using nz. This transpose can be skipped in torchair. 
diff --git a/vllm_ascend/quantization/w8a8.py b/vllm_ascend/quantization/w8a8.py
index dcd692ac..c4f8fb04 100644
--- a/vllm_ascend/quantization/w8a8.py
+++ b/vllm_ascend/quantization/w8a8.py
@@ -25,7 +25,8 @@ from vllm.forward_context import get_forward_context

 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.fused_moe.experts_selector import select_experts
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p, is_enable_nz
+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendDeviceType,
+                               get_ascend_device_type, is_enable_nz)


 def quant_per_tensor(in_tensor: torch.Tensor,
@@ -45,7 +46,8 @@ class AscendW8A8LinearMethod:

     def __init__(self) -> None:
         # aclnn quant matmul requires to transpose matrix B, set to true by default.
-        self.transpose_weight = not is_310p()
+        self.transpose_weight = get_ascend_device_type(
+        ) != AscendDeviceType._310P

     @staticmethod
     def get_weight(
@@ -147,7 +149,7 @@ class AscendW8A8LinearMethod:
             )

         quant_bias = layer.quant_bias if tp_rank == 0 else None
-        if is_310p():
+        if get_ascend_device_type() == AscendDeviceType._310P:
             # On 300I Duo platform, we need transpose again if
             # using nz. This transpose can be skipped in torchair.
             output = torch_npu.npu_quant_matmul(
@@ -299,7 +301,7 @@ class AscendW8A8FusedMoEMethod:
             e_score_correction_bias=e_score_correction_bias,
             global_num_experts=global_num_experts)

-        if is_310p():
+        if get_ascend_device_type() == AscendDeviceType._310P:
             return fused_experts_310p(hidden_states=x,
                                       w1=layer.w13_weight,
                                       w1_scale=layer.w13_weight_scale,
@@ -328,7 +330,7 @@ class AscendW8A8FusedMoEMethod:
                              expert_map=expert_map)

     def process_weights_after_loading(self, layer):
-        if not is_310p():
+        if get_ascend_device_type() != AscendDeviceType._310P:
             layer.w13_weight.data = layer.w13_weight.data.transpose(
                 1, 2).contiguous()
             layer.w2_weight.data = layer.w2_weight.data.transpose(
@@ -345,7 +347,7 @@ class AscendW8A8FusedMoEMethod:
         expanding_factor_w13 = layer.w13_weight.data.shape[1]
         expanding_factor_w2 = layer.w2_weight.data.shape[1]

-        if is_310p():
+        if get_ascend_device_type() == AscendDeviceType._310P:
             layer.w13_input_scale.data = torch.nn.Parameter(
                 layer.w13_input_scale.data.max())
             layer.w2_input_scale.data = torch.nn.Parameter(
@@ -365,7 +367,8 @@ class AscendW8A8FusedMoEMethod:
         # converting ACL_FORMAT_FRACTAL_NZ.
         # npu_quant_grouped_matmul_dequant in eager mode does not accept
         # ACL_FORMAT_FRACTAL_NZ.
-        if not is_310p() and is_enable_nz():
+        if get_ascend_device_type() != AscendDeviceType._310P and is_enable_nz(
+        ):
             layer.w13_weight.data = torch_npu.npu_format_cast(
                 layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ).contiguous()
             layer.w2_weight.data = torch_npu.npu_format_cast(
diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py
index 37abdd49..6c9f37c6 100644
--- a/vllm_ascend/sample/sampler.py
+++ b/vllm_ascend/sample/sampler.py
@@ -3,7 +3,7 @@ import torch_npu
 from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample
 from vllm.v1.sample.sampler import Sampler

-from vllm_ascend.utils import is_310p
+from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type

 DEFAULT_LOGPROBS_MODE = "raw_logprobs"

@@ -25,7 +25,8 @@ class AscendTopKTopPSampler(TopKTopPSampler):
         p: torch.Tensor,
     ) -> torch.Tensor:
         # npu_top_k_top_p uses the operator aclnnApplyTopKTopP, but aclnnApplyTopKTopP currently does not support 310P
-        if not is_310p() and p is not None and k is not None and 1 <= int(
+        if get_ascend_device_type(
+        ) != AscendDeviceType._310P and p is not None and k is not None and 1 <= int(
                 k.max()) <= 1024:
             # npu_top_k_top_p's parameter order is (logits, p, k), not (logits, k, p)
             return torch_npu.npu_top_k_top_p(logits, p, k)
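Since `npu_top_k_top_p` is only taken on non-310P devices with `1 <= k.max() <= 1024`, other configurations fall back to the generic sampling path. For reference, a plain-PyTorch sketch of the filtering the fused kernel performs; the `topk_topp_filter` name is ours, not a vLLM or torch_npu API:

```python
import torch


def topk_topp_filter(logits: torch.Tensor, k: int, p: float) -> torch.Tensor:
    """Reference top-k/top-p filtering on any backend -- a sketch of the
    math behind aclnnApplyTopKTopP, not the kernel itself."""
    # Top-k: keep only the k largest logits in each row.
    kth_value = logits.topk(k, dim=-1).values[..., -1:]
    logits = logits.masked_fill(logits < kth_value, float("-inf"))

    # Top-p: drop the tail once cumulative probability exceeds p.
    sorted_logits, sorted_idx = logits.sort(dim=-1, descending=True)
    cum_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
    to_remove = cum_probs > p
    # Shift right so the first token crossing the threshold is kept.
    to_remove[..., 1:] = to_remove[..., :-1].clone()
    to_remove[..., 0] = False
    remove_mask = to_remove.scatter(-1, sorted_idx, to_remove)
    return logits.masked_fill(remove_mask, float("-inf"))


filtered = topk_topp_filter(torch.randn(2, 32), k=8, p=0.9)
```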
diff --git a/vllm_ascend/torchair/models/torchair_pangu_moe.py b/vllm_ascend/torchair/models/torchair_pangu_moe.py
index 7b000d8b..d81941ff 100644
--- a/vllm_ascend/torchair/models/torchair_pangu_moe.py
+++ b/vllm_ascend/torchair/models/torchair_pangu_moe.py
@@ -57,7 +57,8 @@ from vllm.sequence import IntermediateTensors
 from vllm.v1.sample.sampler import Sampler

 from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p
+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendDeviceType,
+                               get_ascend_device_type)

 _ROUTER_SCALE = None

@@ -448,7 +449,8 @@ class PanguProMoESparseMoeBlock(nn.Module):
         # on 300I Duo platform, we find that num_voted_experts set to 5 achieves
         # good performance without sacrifice too much accuracy. for other platform,
         # this is set to 8 to use original pangu grouped topk.
-        num_voted_experts = 5 if is_310p() else 8
+        num_voted_experts = 5 if get_ascend_device_type(
+        ) == AscendDeviceType._310P else 8

         self.experts = FusedMoE(
             num_experts=config.num_experts,
@@ -1109,7 +1111,8 @@ class PanguProMoEForCausalLM(nn.Module, SupportsPP):
                                         default_weight_loader)
                 weight_loader(param, loaded_weight)
                 loaded_params.add(name)
-                if is_310p() and "head" in name:
+                if get_ascend_device_type(
+                ) == AscendDeviceType._310P and "head" in name:
                     # on 300I Duo platform, ACL_FORMAT_FRACTAL_NZ is much more preferred than
                     # ACL_FORMAT_FRACTAL_ND by matmul operation. Since lmhead is also implemented
                     # by linear, we manually cast the format here.
diff --git a/vllm_ascend/torchair/ops/torchair_activation.py b/vllm_ascend/torchair/ops/torchair_activation.py
index 0721ea0a..0089b663 100644
--- a/vllm_ascend/torchair/ops/torchair_activation.py
+++ b/vllm_ascend/torchair/ops/torchair_activation.py
@@ -28,9 +28,9 @@ def torchair_silu_and_mul_forward_oot(self, x: torch.Tensor) -> torch.Tensor:

     import torch_npu

-    from vllm_ascend.utils import is_310p
+    from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type

-    if is_310p():
+    if get_ascend_device_type() == AscendDeviceType._310P:
         out = torch_npu.npu_swiglu(x.to(torch.float32)).to(torch.float16)
     else:
         out = torch_npu.npu_swiglu(x)
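The activation call site above upcasts to float32 on 310P before casting back to float16. A plain-PyTorch sketch of the same SwiGLU semantics, assuming the packed gate/up layout that `SiluAndMul` uses; the helper name and the `fp16_device` flag are hypothetical:

```python
import torch
import torch.nn.functional as F


def silu_and_mul(x: torch.Tensor, fp16_device: bool) -> torch.Tensor:
    """Sketch of the npu_swiglu call above: gate and up projections are
    packed along the last dim; 310P runs the activation in float32 and
    casts the result back to float16."""
    gate, up = x.chunk(2, dim=-1)
    if fp16_device:  # e.g. AscendDeviceType._310P
        return (F.silu(gate.float()) * up.float()).to(torch.float16)
    return F.silu(gate) * up


out = silu_and_mul(torch.randn(4, 256, dtype=torch.float16), fp16_device=True)
```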
diff --git a/vllm_ascend/torchair/ops/torchair_fused_moe.py b/vllm_ascend/torchair/ops/torchair_fused_moe.py
index a262e284..4408b310 100644
--- a/vllm_ascend/torchair/ops/torchair_fused_moe.py
+++ b/vllm_ascend/torchair/ops/torchair_fused_moe.py
@@ -51,8 +51,8 @@ from vllm_ascend.torchair.utils import (get_all_reduce_merge_state,
                                         get_rm_router_logits_state,
                                         npu_stream_switch, npu_wait_tensor,
                                         super_kernel)
-from vllm_ascend.utils import (AscendSocVersion, dispose_tensor,
-                               get_ascend_soc_version, is_310p,
+from vllm_ascend.utils import (AscendDeviceType, dispose_tensor,
+                               get_ascend_device_type,
                                is_hierarchical_communication_enabled)
@@ -75,11 +75,11 @@ def torchair_fused_experts_with_mc2(
     ep_world_size = moe_parallel_config.ep_size

     # NOTE: Currently, when in A3 or in torchair graph, we need to pass in some extra param into dispatch & combine
-    need_extra_args = (get_ascend_soc_version() == AscendSocVersion.A3
+    need_extra_args = (get_ascend_device_type() == AscendDeviceType._910_93
                        or is_torchair)

     # NOTE: Currently, when in A3, we need to pass in some extra param into dispatch & combine
-    a3_need_extra_args = get_ascend_soc_version() == AscendSocVersion.A3
+    a3_need_extra_args = get_ascend_device_type() == AscendDeviceType._910_93
     # NOTE: When in A2, setting the environment variables HCCL_INTRA_PCIE_ENABLE=1 and
     # HCCL_INTRA_ROCE_ENABLE=0 can reduce cross-machine communication traffic and significantly
     # improve communication performance.
@@ -467,7 +467,7 @@ def torchair_fused_experts_moge(
         group_list=group_list,
     )[0]

-    if is_310p():
+    if get_ascend_device_type() == AscendDeviceType._310P:
         gate_up_out = torch_npu.npu_swiglu(gate_up_out.to(torch.float32)).to(
             torch.float16)
     else:
diff --git a/vllm_ascend/torchair/ops/torchair_layernorm.py b/vllm_ascend/torchair/ops/torchair_layernorm.py
index 583a376b..3a3146b8 100644
--- a/vllm_ascend/torchair/ops/torchair_layernorm.py
+++ b/vllm_ascend/torchair/ops/torchair_layernorm.py
@@ -57,9 +57,9 @@ def torchair_rmsnorm_forward_oot(

     import torch_npu

-    from vllm_ascend.utils import is_310p
+    from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
     if residual is not None:
-        if is_310p():
+        if get_ascend_device_type() == AscendDeviceType._310P:
             orig_dtype = residual.dtype
             x = x + residual.to(x.dtype)
             residual = x.to(orig_dtype)
diff --git a/vllm_ascend/torchair/ops/torchair_rotary_embedding.py b/vllm_ascend/torchair/ops/torchair_rotary_embedding.py
index e64bd6f6..9fdb231b 100644
--- a/vllm_ascend/torchair/ops/torchair_rotary_embedding.py
+++ b/vllm_ascend/torchair/ops/torchair_rotary_embedding.py
@@ -25,7 +25,8 @@ from vllm.model_executor.layers.rotary_embedding import (
     DeepseekScalingRotaryEmbedding, RotaryEmbedding)

 from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.utils import enable_custom_op, is_310p
+from vllm_ascend.utils import (AscendDeviceType, enable_custom_op,
+                               get_ascend_device_type)


 def custom_rotary_embedding_enabled(query, neox_style, head_size):
@@ -60,8 +61,9 @@ def rope_forward_oot(
     if is_neox_style_override is not None:
         neox_style = is_neox_style_override
     # adopt custom kernel path for rotary_embedding
-    if custom_rotary_embedding_enabled(query, neox_style,
-                                       self.head_size) and not is_310p():
+    if custom_rotary_embedding_enabled(
+            query, neox_style, self.head_size) and get_ascend_device_type(
+            ) != AscendDeviceType._310P:
         query, key = torch.ops._C_ascend.rotary_embedding(
             positions,
             query,
diff --git a/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py b/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py
index d8ac2f93..8909bb79 100644
--- a/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py
+++ b/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py
@@ -28,8 +28,8 @@ from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.torchair.ops.torchair_fused_moe import torchair_select_experts
 from vllm_ascend.torchair.utils import (npu_stream_switch, npu_wait_tensor,
                                         super_kernel)
-from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendSocVersion,
-                               dispose_tensor, get_ascend_soc_version,
+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendDeviceType,
+                               dispose_tensor, get_ascend_device_type,
                                is_enable_nz,
                                is_hierarchical_communication_enabled)
@@ -234,11 +234,11 @@ def torchair_fused_experts_with_mc2(
     ep_world_size = ep_group.world_size

     # NOTE: Currently, when in A3 or in torchair graph, we need to pass in some extra param into dispatch & combine
-    need_extra_args = (get_ascend_soc_version() == AscendSocVersion.A3
+    need_extra_args = (get_ascend_device_type() == AscendDeviceType._910_93
                        or is_torchair)

    # NOTE: Currently, when in A3, we need to pass in some extra param into dispatch & combine
-    a3_need_extra_args = get_ascend_soc_version() == AscendSocVersion.A3
+    a3_need_extra_args = get_ascend_device_type() == AscendDeviceType._910_93
     # NOTE: When in A2, setting the environment variables HCCL_INTRA_PCIE_ENABLE=1 and
     # HCCL_INTRA_ROCE_ENABLE=0 can reduce cross-machine communication traffic and significantly
     # improve communication performance.
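The torchair layernorm fallback on 310P performs the residual add in the activation dtype before normalizing. A reference sketch of that branch in plain PyTorch; the helper is hypothetical and stands in for the fused NPU path, not the real torch_npu kernel:

```python
import torch


def add_rms_norm_310p(x: torch.Tensor, residual: torch.Tensor,
                      weight: torch.Tensor, eps: float = 1e-6):
    """Sketch of the 310P fallback above: add the residual in x's dtype,
    stash the sum back as the new residual, then apply RMSNorm."""
    orig_dtype = residual.dtype
    x = x + residual.to(x.dtype)
    residual = x.to(orig_dtype)
    variance = x.float().pow(2).mean(dim=-1, keepdim=True)
    x = (x.float() * torch.rsqrt(variance + eps)).to(x.dtype) * weight
    return x, residual


h = torch.randn(4, 64, dtype=torch.float16)
out, res = add_rms_norm_310p(h, h.clone(), torch.ones(64, dtype=torch.float16))
```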
diff --git a/vllm_ascend/torchair/torchair_attention.py b/vllm_ascend/torchair/torchair_attention.py
index 8edf9fb3..16fcb385 100644
--- a/vllm_ascend/torchair/torchair_attention.py
+++ b/vllm_ascend/torchair/torchair_attention.py
@@ -34,8 +34,8 @@ from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend,
                                                 AscendMetadata)
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.torchair.utils import TorchairCommonAttentionMetadata
-from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
-                               nd_to_nz_2d)
+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendDeviceType,
+                               aligned_16, get_ascend_device_type, nd_to_nz_2d)


 class AscendAttentionTorchairBackend(AscendAttentionBackend):
@@ -185,7 +185,8 @@ class AscendAttentionTorchairMetadataBuilder(AscendAttentionMetadataBuilder):
         attn_mask = common_attn_metadata.attn_mask
         attn_state = common_attn_metadata.attn_state

-        if is_310p() and attn_state == AscendAttentionState.PrefillNoCache:
+        if get_ascend_device_type(
+        ) == AscendDeviceType._310P and attn_state == AscendAttentionState.PrefillNoCache:
             mask_nz = nd_to_nz_2d(attn_mask)
             attn_mask = torch_npu.npu_format_cast(mask_nz.contiguous(), 29)

@@ -381,7 +382,7 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
             key = key.view(-1, self.num_kv_heads, self.head_size)
             value = value.view(-1, self.num_kv_heads, self.head_size)

-            if is_310p():
+            if get_ascend_device_type() == AscendDeviceType._310P:
                 # align q k v output tensors
                 query = aligned_16(query)
                 key = aligned_16(key)
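`aligned_16` pads the token dimension so the 310P attention kernels see shapes that are multiples of 16. A hypothetical reimplementation to illustrate the intent; the real helper lives in `vllm_ascend/utils.py` and may differ in detail:

```python
import torch


def aligned_16_sketch(t: torch.Tensor) -> torch.Tensor:
    """Pad the leading (token) dimension up to the next multiple of 16,
    leaving already-aligned tensors untouched."""
    n = t.shape[0]
    padded = (n + 15) // 16 * 16
    if padded == n:
        return t
    pad = t.new_zeros((padded - n, *t.shape[1:]))
    return torch.cat([t, pad], dim=0)


q = aligned_16_sketch(torch.randn(37, 8, 128))
assert q.shape[0] == 48
```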
diff --git a/vllm_ascend/torchair/torchair_model_runner.py b/vllm_ascend/torchair/torchair_model_runner.py
index 792972f0..d7c55c6e 100644
--- a/vllm_ascend/torchair/torchair_model_runner.py
+++ b/vllm_ascend/torchair/torchair_model_runner.py
@@ -42,8 +42,7 @@ from vllm_ascend.torchair.utils import (
     register_torchair_model, torchair_ops_patch,
     torchair_quant_method_register, write_kv_cache_bytes_to_file)
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
-                               is_310p, get_ascend_soc_version,
-                               AscendSocVersion)
+                               AscendDeviceType, get_ascend_device_type)
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
@@ -125,13 +124,13 @@ class NPUTorchairModelRunner(NPUModelRunner):
                                                    max_num_tokens, tp_size)
         self.mc2_tokens_capacity = max_graph_batch_size

-        if get_ascend_soc_version(
-        ) == AscendSocVersion.A3 and self.mc2_tokens_capacity > 512:
+        if get_ascend_device_type(
+        ) == AscendDeviceType._910_93 and self.mc2_tokens_capacity > 512:
             logger.error(
                 f"A3: the max number of tokens must smaller then 512, but now is {self.mc2_tokens_capacity}"
             )
-        if get_ascend_soc_version(
-        ) == AscendSocVersion.A2 and self.mc2_tokens_capacity > 256:
+        if get_ascend_device_type(
+        ) == AscendDeviceType._910B and self.mc2_tokens_capacity > 256:
             logger.error(
                 f"A2: the max number of tokens must smaller then 256, but now is {self.mc2_tokens_capacity}"
             )
@@ -207,7 +206,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
                                           positions, attn_metadata, num_tokens,
                                           intermediate_tensors, inputs_embeds):
         if with_prefill or self.enable_shared_expert_dp:
-            if is_310p():
+            if get_ascend_device_type() == AscendDeviceType._310P:
                 converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
             hidden_states = super()._generate_dummy_run_hidden_states(
                 with_prefill, is_torchair_compile, input_ids, positions,
@@ -230,7 +229,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
                 assert isinstance(kv, tuple), "kv_cache must be a tuple"
                 torch._dynamo.mark_static(kv[0])
                 torch._dynamo.mark_static(kv[1])
-            if is_310p():
+            if get_ascend_device_type() == AscendDeviceType._310P:
                 converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_NZ)

             compiled_model = self._get_torchair_lazy_compiled_model(num_tokens)
@@ -371,7 +370,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
             "attn_metadata": attn_metadata
         }
         if not with_prefill:
-            if is_310p():
+            if get_ascend_device_type() == AscendDeviceType._310P:
                 converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_NZ)
             compiled_model = self._get_torchair_lazy_compiled_model(
                 padded_num_tokens_across_dp)
@@ -384,7 +383,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
             )
         else:
             assert self.model is not None
-            if is_310p():
+            if get_ascend_device_type() == AscendDeviceType._310P:
                 converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)

             hidden_states = self.model(
@@ -414,7 +413,7 @@ class NPUTorchairModelRunner(NPUModelRunner):

         patch_for_hcom()

-        if is_310p():
+        if get_ascend_device_type() == AscendDeviceType._310P:
             # on 300I Duo platform, we need to patch broadcast. however, this patch will be
             # overwritten by patch_for_hcom in torchair. so we need to re-patch it here.
             from vllm_ascend.patch.platform.patch_distributed import \
@@ -428,7 +427,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
             self.ascend_config.torchair_graph_config.enable_frozen_parameter
         # enabling tiling_schedule_optimize on 300I Duo has some bugs, so we have to
         # disable it on 300I Duo platform now.
-        config.experimental_config.tiling_schedule_optimize = not is_310p()
+        config.experimental_config.tiling_schedule_optimize = get_ascend_device_type(
+        ) != AscendDeviceType._310P
         config.experimental_config.enable_view_optimize = \
             self.ascend_config.torchair_graph_config.enable_view_optimize
         torch.npu.set_compile_mode(jit_compile=False)
@@ -531,8 +531,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
         # NOTE: when enable_expert_parallel on A3, we need to check if `graph_batch_size` is divisible by `tp_size`
         # Because we use x_active_mask for dispatch/combine op on A3, which requires that input shape should be same
         # on all EP ranks
-        if get_ascend_soc_version(
-        ) == AscendSocVersion.A3 and self.parallel_config.enable_expert_parallel:
+        if get_ascend_device_type(
+        ) == AscendDeviceType._910_93 and self.parallel_config.enable_expert_parallel:
            self._align_graph_size_divisible_by_tp_size()

     def _align_graph_size_divisible_by_tp_size(self):
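The two capacity checks above encode per-device MC2 token limits (512 on A3/910_93, 256 on A2/910B). A table-driven sketch of the same validation; `validate_mc2_capacity` and the raise-instead-of-log behavior are our assumptions, not the runner's:

```python
from enum import Enum


class AscendDeviceType(Enum):
    _910B = 0    # A2
    _910_93 = 1  # A3
    _310P = 2
    _910_95 = 3  # A5


# Per-device MC2 token limits taken from the checks above.
_MC2_TOKEN_LIMITS = {
    AscendDeviceType._910_93: 512,
    AscendDeviceType._910B: 256,
}


def validate_mc2_capacity(device_type: AscendDeviceType,
                          capacity: int) -> None:
    """Hypothetical table-driven rewrite of the two if-blocks above."""
    limit = _MC2_TOKEN_LIMITS.get(device_type)
    if limit is not None and capacity > limit:
        raise ValueError(
            f"{device_type.name}: mc2_tokens_capacity must be at most "
            f"{limit}, but is {capacity}")
```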
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 2ae0da31..efb1d5f5 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -48,7 +48,6 @@ ACL_FORMAT_FRACTAL_ND = 2
 ACL_FORMAT_FRACTAL_NZ = 29

 _CUSTOM_OP_ENABLED = None
-_IS_310P = None
 _SLEEP_MODE_ENABLED = None
 _CURRENT_STREAM = None
 _PREFETCH_STREAM = None
@@ -121,14 +120,6 @@ def _unregister_print_streams_on_exit():
 atexit.register(_unregister_print_streams_on_exit)


-def is_310p():
-    global _IS_310P
-    if _IS_310P is None:
-        from vllm_ascend import _build_info  # type: ignore
-        _IS_310P = _build_info.__soc_version__.lower().startswith("ascend310p")
-    return _IS_310P
-
-
 def is_enable_nz():
     return envs_ascend.VLLM_ASCEND_ENABLE_NZ
@@ -703,32 +694,47 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None):
     _ASCEND_CUSTOMOP_IS_REIGISTERED = True


-# TODO(zzzzwwjj): Currently there is no clear SOC_VERSION policy for A2 and A3 in CANN.
-# So we get the version dynamically. In the future, we should get the version info from _build_info like 310p does.
-class AscendSocVersion(Enum):
-    A2 = 0
-    A3 = 1
-    UNDEFINED = 2
+class AscendDeviceType(Enum):
+    _910B = 0  # A2
+    _910_93 = 1  # A3
+    _310P = 2
+    _910_95 = 3  # A5


-_ascend_soc_version = None
+_ascend_device_type = None


-def init_ascend_soc_version():
+def _init_ascend_device_type():
+    global _ascend_device_type
+    from vllm_ascend import _build_info  # type: ignore
+    _ascend_device_type = AscendDeviceType[_build_info.__device_type__]
+
+
+def check_ascend_device_type():
+    global _ascend_device_type
+    if _ascend_device_type is None:
+        _init_ascend_device_type()
+
     soc_version = torch_npu.npu.get_soc_version()
-    global _ascend_soc_version
     if 220 <= soc_version <= 225:
-        _ascend_soc_version = AscendSocVersion.A2
+        cur_device_type = AscendDeviceType._910B
     elif 250 <= soc_version <= 255:
-        _ascend_soc_version = AscendSocVersion.A3
+        cur_device_type = AscendDeviceType._910_93
+    elif 200 <= soc_version <= 205:
+        cur_device_type = AscendDeviceType._310P
+    elif soc_version == 260:
+        cur_device_type = AscendDeviceType._910_95
     else:
-        _ascend_soc_version = AscendSocVersion.UNDEFINED
+        raise RuntimeError(f"Can not support soc_version: {soc_version}.")
+
+    assert _ascend_device_type == cur_device_type, f"Current device type: {cur_device_type} does not match the installed version's device type: {_ascend_device_type}, please check your installation package."


-def get_ascend_soc_version():
-    global _ascend_soc_version
-    assert _ascend_soc_version is not None
-    return _ascend_soc_version
+def get_ascend_device_type():
+    global _ascend_device_type
+    if _ascend_device_type is None:
+        _init_ascend_device_type()
+    return _ascend_device_type


 def lmhead_tp_enable() -> bool:
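The heart of the change is this mapping plus the build-time/runtime cross-check. A condensed, self-contained restatement of the ranges used by `check_ascend_device_type`; it is standalone for illustration, whereas the real code reads `_build_info.__device_type__` and `torch_npu.npu.get_soc_version()`:

```python
from enum import Enum


class AscendDeviceType(Enum):
    _910B = 0    # A2
    _910_93 = 1  # A3
    _310P = 2
    _910_95 = 3  # A5


def device_type_from_soc_version(soc_version: int) -> AscendDeviceType:
    """The soc_version -> device-type ranges from check_ascend_device_type."""
    if 220 <= soc_version <= 225:
        return AscendDeviceType._910B
    if 250 <= soc_version <= 255:
        return AscendDeviceType._910_93
    if 200 <= soc_version <= 205:
        return AscendDeviceType._310P
    if soc_version == 260:
        return AscendDeviceType._910_95
    raise RuntimeError(f"Can not support soc_version: {soc_version}.")


# The value written into _build_info.py at compile time must agree with
# the runtime probe; 223 here is just an example soc_version.
build_time = AscendDeviceType["_910B"]       # from __device_type__
runtime = device_type_from_soc_version(223)  # from torch_npu at runtime
assert build_time == runtime
```

Keeping the runtime check as an assert against the build-time value is what guarantees the "consistency between compiling and running states" goal stated in the PR description.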
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 30add8e7..ff55d1d1 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -138,9 +138,9 @@ from vllm_ascend.spec_decode.interface import SpecDcodeType
 from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
 from vllm_ascend.torchair.torchair_mtp_proposer import TorchairMtpProposer
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
-                               AscendSocVersion, ProfileExecuteDuration,
-                               enable_sp, get_ascend_soc_version, is_310p,
-                               is_enable_nz, is_moe_model, lmhead_tp_enable,
+                               AscendDeviceType, ProfileExecuteDuration,
+                               enable_sp, get_ascend_device_type, is_enable_nz,
+                               is_moe_model, lmhead_tp_enable,
                                prefill_context_parallel_enable)
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
@@ -161,7 +161,7 @@ import torch_npu

 # if true, allow tensor initialization and casting with internal format (e.g., NZ)
 torch.npu.config.allow_internal_format = True
-if is_310p():
+if get_ascend_device_type() == AscendDeviceType._310P:
     torch_npu.npu.set_compile_mode(jit_compile=False)
     ACL_FORMAT = ACL_FORMAT_FRACTAL_NZ
 else:
@@ -2226,14 +2226,14 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         if not is_moe_model(self.vllm_config):
             return None

-        soc_version = get_ascend_soc_version()
+        soc_version = get_ascend_device_type()
         quant_type = getattr(self.vllm_config.model_config.hf_config,
                              'moe_quantize', None)
         model_type = self.vllm_config.model_config.hf_config.model_type

         if not self.parallel_config.enable_expert_parallel:
             moe_comm_type = MoECommType.ALLGATHER
-        elif soc_version in {AscendSocVersion.A2}:
+        elif soc_version in {AscendDeviceType._910B}:
             if (num_tokens <= self.mc2_tokens_capacity
                     and self.parallel_config.world_size_across_dp >= 16):
                 moe_comm_type = MoECommType.MC2
@@ -2244,7 +2244,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             else:
                 moe_comm_type = MoECommType.ALLGATHER

-        elif soc_version in {AscendSocVersion.A3}:
+        elif soc_version in {AscendDeviceType._910_93}:
             moe_comm_type = (MoECommType.MC2
                              if num_tokens <= self.mc2_tokens_capacity
                              else MoECommType.ALLTOALL)
@@ -3183,7 +3183,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         self.model = get_model(vllm_config=self.vllm_config)
         if self.dynamic_eplb:
             model_register(self.model, self.model_config)
-        if is_310p():
+        if get_ascend_device_type() == AscendDeviceType._310P:
             from vllm.model_executor.layers.linear import (
                 MergedColumnParallelLinear, QKVParallelLinear,
                 RowParallelLinear)
diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index f67cfec1..e9000eae 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -50,7 +50,7 @@ from vllm_ascend.cpu_binding import bind_cpus
 from vllm_ascend.device_allocator.camem import CaMemAllocator
 from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
 from vllm_ascend.platform import NPUPlatform
-from vllm_ascend.utils import (init_ascend_soc_version, is_enable_nz,
+from vllm_ascend.utils import (check_ascend_device_type, is_enable_nz,
                                prefill_context_parallel_enable,
                                register_ascend_customop, sleep_mode_enabled,
                                try_register_lib)
@@ -91,7 +91,7 @@ class NPUWorker(WorkerBase):
         register_ascend_customop(vllm_config)
         # init ascend config and soc version
         init_ascend_config(vllm_config)
-        init_ascend_soc_version()
+        check_ascend_device_type()
         use_sparse = False
         if vllm_config.model_config is not None:
             use_sparse = hasattr(vllm_config.model_config.hf_config,
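Finally, the runner's MoE communication selection branches on the same enum. A simplified sketch of the selection logic above, returning plain strings for illustration and omitting the quantization/model-type branches elided from the hunk:

```python
from enum import Enum


class AscendDeviceType(Enum):
    _910B = 0    # A2
    _910_93 = 1  # A3
    _310P = 2
    _910_95 = 3  # A5


def select_moe_comm_type(device_type: AscendDeviceType, num_tokens: int,
                         mc2_capacity: int, ep_enabled: bool,
                         world_size_across_dp: int) -> str:
    """Condensed sketch of the comm-type selection above; the real code
    returns MoECommType members and has additional branches not shown in
    the diff."""
    if not ep_enabled:
        return "ALLGATHER"
    if device_type == AscendDeviceType._910B:
        # A2: MC2 only pays off within capacity and at >= 16 ranks.
        if num_tokens <= mc2_capacity and world_size_across_dp >= 16:
            return "MC2"
        return "ALLGATHER"
    if device_type == AscendDeviceType._910_93:
        # A3: MC2 within capacity, otherwise fall back to all-to-all.
        return "MC2" if num_tokens <= mc2_capacity else "ALLTOALL"
    return "ALLGATHER"


assert select_moe_comm_type(AscendDeviceType._910_93, 128, 512, True, 8) == "MC2"
```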