import importlib
from unittest.mock import MagicMock, patch

import pytest
import torch
from vllm.attention.selector import AttentionSelectorConfig
from vllm.config.compilation import CompilationMode, CUDAGraphMode
from vllm.platforms import PlatformEnum

from tests.ut.base import TestBase
from vllm_ascend.platform import NPUPlatform
from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD,
                               COMPRESSED_TENSORS_METHOD, AscendDeviceType)


class TestNPUPlatform(TestBase):
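    # Covers NPUPlatform's class attributes, the argparse registration hook,
    # device queries, and the config rewriting done by check_and_update_config.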

    @staticmethod
    def mock_vllm_config():
        """Build a mocked VllmConfig exposing the fields the platform touches."""
        mock_vllm_config = MagicMock()
        mock_vllm_config.compilation_config = MagicMock()
        mock_vllm_config.model_config = MagicMock()
        mock_vllm_config.parallel_config = MagicMock()
        mock_vllm_config.cache_config = MagicMock()
        mock_vllm_config.scheduler_config = MagicMock()
        mock_vllm_config.speculative_config = None
        mock_vllm_config.compilation_config.pass_config.enable_sp = False
        mock_vllm_config.compilation_config.cudagraph_mode = None
        return mock_vllm_config

    @staticmethod
    def mock_vllm_ascend_config():
        """Build a mocked AscendConfig with optional graph features disabled."""
        mock_ascend_config = MagicMock()
        mock_ascend_config.xlite_graph_config.enabled = False
        mock_ascend_config.enable_shared_expert_dp = False
        return mock_ascend_config

    def setUp(self):
        self.platform = NPUPlatform()

    def test_class_variables(self):
        self.assertEqual(NPUPlatform._enum, PlatformEnum.OOT)
        self.assertEqual(NPUPlatform.device_name, "npu")
        self.assertEqual(NPUPlatform.device_type, "npu")
        self.assertEqual(NPUPlatform.simple_compile_backend, "eager")
        self.assertEqual(NPUPlatform.ray_device_key, "NPU")
        self.assertEqual(NPUPlatform.device_control_env_var,
                         "ASCEND_RT_VISIBLE_DEVICES")
        self.assertEqual(NPUPlatform.dispatch_key, "PrivateUse1")
        self.assertEqual(
            NPUPlatform.supported_quantization,
            [ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD])

    def test_is_sleep_mode_available(self):
        self.assertTrue(self.platform.is_sleep_mode_available())

    @patch("vllm_ascend.utils.adapt_patch")
    @patch("vllm_ascend.quantization.quant_config.AscendQuantConfig")
    def test_pre_register_and_update_with_parser(self, mock_quant_config,
                                                 mock_adapt_patch):
        mock_parser = MagicMock()
        mock_action = MagicMock()
        mock_action.choices = ["awq", "gptq"]
        mock_parser._option_string_actions = {"--quantization": mock_action}

        self.platform.pre_register_and_update(mock_parser)

        mock_adapt_patch.assert_called_once_with(is_global_patch=True)
        self.assertTrue(ASCEND_QUANTIZATION_METHOD in mock_action.choices)
        self.assertEqual(len(mock_action.choices), 3)  # original 2 + ascend

    @patch("vllm_ascend.utils.adapt_patch")
    @patch("vllm_ascend.quantization.quant_config.AscendQuantConfig")
    def test_pre_register_and_update_without_parser(self, mock_quant_config,
                                                    mock_adapt_patch):
        self.platform.pre_register_and_update(None)

        mock_adapt_patch.assert_called_once_with(is_global_patch=True)

    @patch("vllm_ascend.utils.adapt_patch")
    @patch("vllm_ascend.quantization.quant_config.AscendQuantConfig")
    def test_pre_register_and_update_with_parser_no_quant_action(
            self, mock_quant_config, mock_adapt_patch):
        mock_parser = MagicMock()
        mock_parser._option_string_actions = {}

        self.platform.pre_register_and_update(mock_parser)

        mock_adapt_patch.assert_called_once_with(is_global_patch=True)

    @patch("vllm_ascend.utils.adapt_patch")
    @patch("vllm_ascend.quantization.quant_config.AscendQuantConfig")
    def test_pre_register_and_update_with_existing_ascend_quant(
            self, mock_quant_config, mock_adapt_patch):
        mock_parser = MagicMock()
        mock_action = MagicMock()
        mock_action.choices = ["awq", ASCEND_QUANTIZATION_METHOD]
        mock_parser._option_string_actions = {"--quantization": mock_action}

        self.platform.pre_register_and_update(mock_parser)

        mock_adapt_patch.assert_called_once_with(is_global_patch=True)
        # The ascend method is already registered, so no duplicate is appended.
        self.assertEqual(len(mock_action.choices), 2)
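
    # Taken together, the four tests above pin down the parser-update
    # contract: pre_register_and_update() always applies the global patch,
    # and appends ASCEND_QUANTIZATION_METHOD to the --quantization choices
    # only when that option exists and the method is not already registered.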

    def test_get_device_capability(self):
        self.assertIsNone(self.platform.get_device_capability(device_id=0))

    @patch("torch.npu.get_device_name")
    def test_get_device_name(self, mock_get_device_name):
        device_id = 0
        device_name = "Ascend910B2"
        mock_get_device_name.return_value = device_name
        self.assertEqual(self.platform.get_device_name(device_id), device_name)
        mock_get_device_name.assert_called_once_with(0)

    @patch("torch.inference_mode")
    def test_inference_mode(self, mock_inference_mode):
        mock_inference_mode.return_value = None
        self.assertIsNone(self.platform.inference_mode())
        mock_inference_mode.assert_called_once()
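
    # The device helpers above are thin wrappers: get_device_name() and
    # inference_mode() delegate directly to the corresponding torch APIs,
    # which is all these tests pin down.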

    @patch("vllm_ascend.ascend_config.init_ascend_config")
    @patch("vllm_ascend.utils.update_aclgraph_sizes")
    @patch('vllm_ascend.utils.get_ascend_device_type',
           return_value=AscendDeviceType.A3)
    @patch("os.environ", {})
    @patch(
        "vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
    )
    def test_check_and_update_config_basic_config_update(
            self, mock_init_recompute, mock_soc_version, mock_update_acl,
            mock_init_ascend):
        mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config(
        )
        vllm_config = TestNPUPlatform.mock_vllm_config()
        vllm_config.parallel_config.enable_expert_parallel = False
        vllm_config.parallel_config.decode_context_parallel_size = 1
        vllm_config.parallel_config.prefill_context_parallel_size = 1
        vllm_config.parallel_config.tensor_parallel_size = 1
        mock_init_recompute.return_value = MagicMock()
        vllm_config.scheduler_config = MagicMock()

        # Use importlib.reload to reload the platform module, ensuring the
        # mocked init_ascend_config is used. Without this reload, calling
        # self.platform.check_and_update_config would execute the original,
        # unmocked init_ascend_config and the unit test would fail.
        from vllm_ascend import platform

        importlib.reload(platform)

        self.platform.check_and_update_config(vllm_config)

        mock_init_ascend.assert_called_once_with(vllm_config)
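
    # Why the reload above is necessary, as a minimal sketch (module and
    # helper names below are illustrative, not the real vllm_ascend layout):
    #
    #   # mymod.py
    #   from helpers import init_config   # name bound once, at import time
    #
    #   # test code
    #   with patch("helpers.init_config", return_value="mocked"):
    #       import mymod
    #       importlib.reload(mymod)  # re-runs the import, re-binding the name
    #       assert mymod.init_config() == "mocked"
    #
    # Without the reload, mymod.init_config would still reference the object
    # captured at first import, and the patch would never be observed.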

    @patch('vllm_ascend.utils.get_ascend_device_type',
           return_value=AscendDeviceType.A3)
    @patch("vllm_ascend.ascend_config.init_ascend_config")
    @patch(
        "vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
    )
    def test_check_and_update_config_no_model_config_warning(
            self, mock_init_recompute, mock_init_ascend, mock_soc_version):
        mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config(
        )
        vllm_config = TestNPUPlatform.mock_vllm_config()
        vllm_config.model_config = None
        vllm_config.parallel_config.decode_context_parallel_size = 1
        vllm_config.parallel_config.prefill_context_parallel_size = 1
        vllm_config.parallel_config.tensor_parallel_size = 1
        mock_init_recompute.return_value = MagicMock()
        vllm_config.scheduler_config = MagicMock()

        with self.assertLogs(logger="vllm", level="WARNING") as cm:
            from vllm_ascend import platform

            importlib.reload(platform)
            self.platform.check_and_update_config(vllm_config)
            self.assertTrue("Model config is missing" in cm.output[0])

    @patch('vllm_ascend.utils.get_ascend_device_type',
           return_value=AscendDeviceType.A3)
    @patch("vllm_ascend.ascend_config.init_ascend_config")
    @patch(
        "vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
    )
    def test_check_and_update_config_enforce_eager_mode(
            self, mock_init_recompute, mock_init_ascend, mock_soc_version):
        mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config(
        )
        vllm_config = TestNPUPlatform.mock_vllm_config()
        vllm_config.model_config.enforce_eager = True
        vllm_config.parallel_config.decode_context_parallel_size = 1
        vllm_config.parallel_config.prefill_context_parallel_size = 1
        vllm_config.parallel_config.tensor_parallel_size = 1
        mock_init_recompute.return_value = MagicMock()
        vllm_config.scheduler_config = MagicMock()

        with self.assertLogs(logger="vllm", level="INFO") as cm:
            from vllm_ascend import platform

            importlib.reload(platform)
            self.platform.check_and_update_config(vllm_config)
            self.assertTrue("Compilation disabled, using eager mode by default"
                            in cm.output[0])

        self.assertEqual(
            vllm_config.compilation_config.mode,
            CompilationMode.NONE,
        )
        self.assertEqual(
            vllm_config.compilation_config.cudagraph_mode,
            CUDAGraphMode.NONE,
        )
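
    # With enforce_eager set, the platform disables both torch compilation
    # and graph capture: compilation mode and cudagraph_mode both end up NONE.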

    @patch('vllm_ascend.utils.get_ascend_device_type',
           return_value=AscendDeviceType.A3)
    @patch("vllm_ascend.utils.update_default_aclgraph_sizes")
    @patch("vllm_ascend.ascend_config.init_ascend_config")
    @patch(
        "vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
    )
    def test_check_and_update_config_unsupported_compilation_level(
            self, mock_init_recompute, mock_init_ascend, mock_update_default,
            mock_soc_version):
        mock_update_default.return_value = MagicMock()
        mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config(
        )
        vllm_config = TestNPUPlatform.mock_vllm_config()
        vllm_config.model_config.enforce_eager = False
        vllm_config.parallel_config.decode_context_parallel_size = 1
        vllm_config.parallel_config.prefill_context_parallel_size = 1
        vllm_config.parallel_config.tensor_parallel_size = 1
        mock_init_recompute.return_value = MagicMock()
        vllm_config.scheduler_config = MagicMock()

        vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE

        with self.assertLogs(logger="vllm", level="WARNING") as cm:
            from vllm_ascend import platform

            importlib.reload(platform)
            self.platform.check_and_update_config(vllm_config)
            self.assertTrue("NPU does not support" in cm.output[0])

        self.assertEqual(
            vllm_config.compilation_config.mode,
            CompilationMode.NONE,
        )
        self.assertEqual(
            vllm_config.compilation_config.cudagraph_mode,
            CUDAGraphMode.NONE,
        )

    @pytest.mark.skip(
        "Revert me when vllm supports setting cudagraph_mode on oot platform")
    @patch('vllm_ascend.utils.get_ascend_device_type',
           return_value=AscendDeviceType.A3)
    @patch("vllm_ascend.ascend_config.init_ascend_config")
    def test_check_and_update_config_unsupported_cudagraph_mode(
            self, mock_init_ascend, mock_soc_version):
        mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config(
        )
        vllm_config = TestNPUPlatform.mock_vllm_config()
        vllm_config.model_config.enforce_eager = False
        vllm_config.compilation_config.cudagraph_mode = CUDAGraphMode.FULL

        with self.assertLogs(logger="vllm", level="INFO") as cm:
            from vllm_ascend import platform

            importlib.reload(platform)
            self.platform.check_and_update_config(vllm_config)
            # The asserted substring must match the platform's log message
            # verbatim, including its original wording.
            self.assertTrue(
                "cudagraph_mode is not support on NPU. falling back to NONE"
                in cm.output[0])

        self.assertEqual(
            vllm_config.compilation_config.mode,
            CompilationMode.NONE,
        )
        self.assertEqual(
            vllm_config.compilation_config.cudagraph_mode,
            CUDAGraphMode.NONE,
        )

    @patch('vllm_ascend.utils.get_ascend_device_type',
           return_value=AscendDeviceType.A3)
    @patch("vllm_ascend.ascend_config.init_ascend_config")
    @patch(
        "vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
    )
    def test_check_and_update_config_cache_config_block_size(
            self, mock_init_recompute, mock_init_ascend, mock_soc_version):
        mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config(
        )
        vllm_config = TestNPUPlatform.mock_vllm_config()
        vllm_config.cache_config.block_size = None
        vllm_config.cache_config.enable_prefix_caching = True
        vllm_config.parallel_config.decode_context_parallel_size = 1
        vllm_config.parallel_config.prefill_context_parallel_size = 1
        vllm_config.parallel_config.tensor_parallel_size = 1
        mock_init_recompute.return_value = MagicMock()
        vllm_config.scheduler_config = MagicMock()

        from vllm_ascend import platform

        importlib.reload(platform)

        self.platform.check_and_update_config(vllm_config)

        self.assertEqual(vllm_config.cache_config.block_size, 128)
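
    # When block_size is left unset, the platform falls back to 128, the
    # default KV-cache block size asserted above.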
|
2025-07-02 17:46:06 +08:00
|
|
|
|
[refact] unified soc_version code (#4359)
### What this PR does / why we need it?
Currently, there are two paths to judge the chip type in code,
`get_ascend_soc_version` use `get_soc_version` api in torch_npu, and
`is_310p` `use _build_info.__soc_version__`, which generate when
install. We need to unify the two paths.
We need to unify these codes based on the following points:
1. We need to ensure consistency in chip type judgment between compiling
and running states;
2. In compiling state, we need chip type to complete op's compilation,
but in running state, we only need device
type(910B/910_93/310P/910_95/etc) to make code branch judgement;
3. In compiling state, torch_npu may not have been installed yet, so we
can't use torch_npu's api.
Based on the above points, we have made the following changes:
1. When user set env `SOC_VERSION`, use it; when not set, query
soc_version by `npu-smi`;
2. generate device_type based on soc_version when compiling, and write
`__device_type__` instead of `__soc_version__` in `_build_info.py`;
3. In running state, use `__device_type__` to judge code branch.
### Does this PR introduce _any_ user-facing change?
When not set env `SOC_VERSION`, it will not be `ASCEND910B1` by default,
we will query soc_version by `npu-smi`. And env `SOC_VERSION` must be in
the list `soc_to_device` in `setup.py`.
- vLLM version: v0.11.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379
Signed-off-by: zzzzwwjj <1183291235@qq.com>
2025-11-26 14:28:55 +08:00
|
|
|
@patch('vllm_ascend.utils.get_ascend_device_type',
|
2025-12-17 14:08:19 +08:00
|
|
|
return_value=AscendDeviceType.A3)
|
2025-07-02 17:46:06 +08:00
|
|
|
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
2025-10-20 16:30:57 +08:00
|
|
|
@patch(
|
2025-12-11 22:24:49 +08:00
|
|
|
"vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
|
2025-10-20 16:30:57 +08:00
|
|
|
)
|
2025-07-02 17:46:06 +08:00
|
|
|
def test_check_and_update_config_v1_worker_class_selection(
|
2025-12-11 11:21:13 +08:00
|
|
|
self, mock_init_recompute, mock_init_ascend, mock_soc_version):
|
2025-09-02 18:34:04 +08:00
|
|
|
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config(
|
|
|
|
|
)
|
|
|
|
|
vllm_config = TestNPUPlatform.mock_vllm_config()
|
|
|
|
|
vllm_config.parallel_config.worker_cls = "auto"
|
2025-12-05 10:31:49 +08:00
|
|
|
vllm_config.parallel_config.decode_context_parallel_size = 1
|
|
|
|
|
vllm_config.parallel_config.prefill_context_parallel_size = 1
|
2025-10-20 16:30:57 +08:00
|
|
|
vllm_config.parallel_config.tensor_parallel_size = 1
|
|
|
|
|
mock_init_recompute.return_value = MagicMock()
        vllm_config.scheduler_config = MagicMock()

        from vllm_ascend import platform
        importlib.reload(platform)
        self.platform.check_and_update_config(vllm_config)

        self.assertEqual(
            vllm_config.parallel_config.worker_cls,
            "vllm_ascend.worker.worker.NPUWorker",
        )

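        # With the xlite graph enabled, the Xlite worker should be selected
        # instead of the default NPUWorker.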
        test_ascend_config = TestNPUPlatform.mock_vllm_ascend_config()
        test_ascend_config.xlite_graph_config.enabled = True
        mock_init_ascend.return_value = test_ascend_config
        vllm_config.parallel_config.worker_cls = "auto"
        self.platform.check_and_update_config(vllm_config)
        self.assertEqual(
            vllm_config.parallel_config.worker_cls,
            "vllm_ascend.xlite.xlite_worker.XliteWorker",
        )

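    # On a 310P device, an explicitly empty custom_ops list should remain
    # empty after check_and_update_config runs.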
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
    @patch('vllm_ascend.utils.get_ascend_device_type',
           return_value=AscendDeviceType._310P)
    @patch(
        "vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config"
    )
    def test_check_and_update_config_310p_no_custom_ops(
            self, mock_init_recompute, mock_soc_version, mock_init_ascend):
        mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config(
        )
        vllm_config = TestNPUPlatform.mock_vllm_config()
        vllm_config.compilation_config.custom_ops = []
        vllm_config.parallel_config.decode_context_parallel_size = 1
        vllm_config.parallel_config.prefill_context_parallel_size = 1
        vllm_config.parallel_config.tensor_parallel_size = 1
        mock_init_recompute.return_value = MagicMock()
        vllm_config.scheduler_config = MagicMock()

        from vllm_ascend import platform
        importlib.reload(platform)
        self.platform.check_and_update_config(vllm_config)

        self.assertEqual(vllm_config.compilation_config.custom_ops, [])

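    # With use_mla=True, the MLA-specific attention backend should be
    # selected.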
    def test_get_attn_backend_cls_use_v1_and_mla(self):
        attn_selector_config = AttentionSelectorConfig(
            dtype=torch.float16,
            head_size=0,
            kv_cache_dtype=None,
            block_size=128,
            use_mla=True,
            use_sparse=False,
        )
        result = self.platform.get_attn_backend_cls("ascend",
                                                    attn_selector_config)
        self.assertEqual(result,
                         "vllm_ascend.attention.mla_v1.AscendMLABackend")

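    # With use_mla=False, the default Ascend attention backend should be
    # selected.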
    def test_get_attn_backend_cls_use_v1_only(self):
        attn_selector_config = AttentionSelectorConfig(
            dtype=torch.float16,
            head_size=0,
            kv_cache_dtype=None,
            block_size=128,
            use_mla=False,
            use_sparse=False,
        )
        result = self.platform.get_attn_backend_cls("ascend",
                                                    attn_selector_config)
        self.assertEqual(
            result,
            "vllm_ascend.attention.attention_v1.AscendAttentionBackend")

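    # The platform should report the NPU-specific punica wrapper for LoRA.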
    def test_get_punica_wrapper(self):
        result = self.platform.get_punica_wrapper()
        self.assertEqual(result,
                         "vllm_ascend.lora.punica_npu.PunicaWrapperNPU")

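    # Querying memory usage for an explicit device should reset the peak
    # stats for that device and return its peak allocation.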
@patch("torch.npu.reset_peak_memory_stats")
|
|
|
|
|
@patch("torch.npu.max_memory_allocated")
|
|
|
|
|
def test_get_current_memory_usage_with_specific_device(
|
|
|
|
|
self, mock_max_memory, mock_reset_stats):
|
|
|
|
|
max_memory_allocated_result = 1024.0
|
|
|
|
|
mock_max_memory.return_value = max_memory_allocated_result
|
|
|
|
|
test_device = torch.device("npu:0")
|
|
|
|
|
result = self.platform.get_current_memory_usage(device=test_device)
|
|
|
|
|
|
|
|
|
|
mock_reset_stats.assert_called_once_with(test_device)
|
|
|
|
|
mock_max_memory.assert_called_once_with(test_device)
|
|
|
|
|
self.assertEqual(result, max_memory_allocated_result)
|
|
|
|
|
|
|
|
|
|
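    # Without an explicit device, both torch.npu calls should receive None.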
@patch("torch.npu.reset_peak_memory_stats")
|
|
|
|
|
@patch("torch.npu.max_memory_allocated")
|
|
|
|
|
def test_get_current_memory_usage_with_default_device(
|
|
|
|
|
self, mock_max_memory, mock_reset_stats):
|
|
|
|
|
max_memory_allocated_result = 1024.0
|
|
|
|
|
mock_max_memory.return_value = max_memory_allocated_result
|
|
|
|
|
|
|
|
|
|
result = self.platform.get_current_memory_usage()
|
|
|
|
|
|
|
|
|
|
mock_reset_stats.assert_called_once_with(None)
|
|
|
|
|
mock_max_memory.assert_called_once_with(None)
|
|
|
|
|
self.assertEqual(result, max_memory_allocated_result)
|
|
|
|
|
|
|
|
|
|
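    # A failure while resetting peak stats should propagate, and the memory
    # query should never run.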
@patch("torch.npu.reset_peak_memory_stats",
|
|
|
|
|
side_effect=RuntimeError("Device error"))
|
|
|
|
|
@patch("torch.npu.max_memory_allocated")
|
|
|
|
|
def test_get_current_memory_usage_when_reset_stats_fails(
|
|
|
|
|
self, mock_max_memory, mock_reset_stats):
|
|
|
|
|
with self.assertRaises(RuntimeError):
|
|
|
|
|
self.platform.get_current_memory_usage()
|
|
|
|
|
mock_reset_stats.assert_called_once()
|
|
|
|
|
mock_max_memory.assert_not_called()
|
|
|
|
|
|
|
|
|
|
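    # A failure in the memory query itself should propagate after the peak
    # stats have been reset.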
@patch("torch.npu.reset_peak_memory_stats")
|
|
|
|
|
@patch(
|
|
|
|
|
"torch.npu.max_memory_allocated",
|
|
|
|
|
side_effect=RuntimeError("Memory query failed"),
|
|
|
|
|
)
|
|
|
|
|
def test_get_current_memory_usage_when_query_fails(self, mock_max_memory,
|
|
|
|
|
mock_reset_stats):
|
|
|
|
|
with self.assertRaises(RuntimeError):
|
|
|
|
|
self.platform.get_current_memory_usage()
|
|
|
|
|
mock_reset_stats.assert_called_once()
|
|
|
|
|
mock_max_memory.assert_called_once()
|
|
|
|
|
|
|
|
|
|
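    # The platform should expose the NPU communicator class path.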
    def test_get_device_communicator_cls_returns_correct_value(self):
        self.assertEqual(
            self.platform.get_device_communicator_cls(),
            "vllm_ascend.distributed.communicator.NPUCommunicator",
        )

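    # Pin memory should be reported as available on NPU.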
    def test_is_pin_memory_available_returns_true(self):
        self.assertTrue(self.platform.is_pin_memory_available())

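    # The ACL graph wrapper should be returned as the static graph wrapper.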
    def test_get_static_graph_wrapper_cls_returns_correct_value(self):
        self.assertEqual(
            self.platform.get_static_graph_wrapper_cls(),
            "vllm_ascend.compilation.acl_graph.ACLGraphWrapper",
        )