# xc-llm-ascend/tests/ut/test_platform.py
import importlib
from unittest.mock import MagicMock, patch
import pytest
import torch
from vllm.config.compilation import CompilationMode, CUDAGraphMode
from vllm.platforms import PlatformEnum
from tests.ut.base import TestBase
from vllm_ascend.platform import NPUPlatform
from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD,
                               COMPRESSED_TENSORS_METHOD, AscendDeviceType,
                               vllm_version_is)

# isort: off
if vllm_version_is("0.13.0"):
    from vllm.attention.selector import AttentionSelectorConfig  # type: ignore
else:
    from vllm.v1.attention.selector import AttentionSelectorConfig  # type: ignore
# isort: on
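
# The tests in this module exercise NPUPlatform's class-level metadata,
# quantization-method registration in pre_register_and_update(), and the
# config normalization done by check_and_update_config(), using mocked
# vLLM configs.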


class TestNPUPlatform(TestBase):

    @staticmethod
    def mock_vllm_config():
        mock_vllm_config = MagicMock()
        mock_vllm_config.compilation_config = MagicMock()
        mock_vllm_config.model_config = MagicMock()
        mock_vllm_config.parallel_config = MagicMock()
        mock_vllm_config.cache_config = MagicMock()
        mock_vllm_config.scheduler_config = MagicMock()
        mock_vllm_config.speculative_config = None
        mock_vllm_config.compilation_config.pass_config.enable_sp = False
        mock_vllm_config.compilation_config.cudagraph_mode = None
        return mock_vllm_config

    @staticmethod
    def mock_vllm_ascend_config():
        mock_ascend_config = MagicMock()
        mock_ascend_config.xlite_graph_config.enabled = False
        mock_ascend_config.enable_shared_expert_dp = False
        return mock_ascend_config

    def setUp(self):
        self.platform = NPUPlatform()
        self.platform.supported_quantization[:] = [
            "ascend", "compressed-tensors"
        ]

    def test_class_variables(self):
        self.assertEqual(NPUPlatform._enum, PlatformEnum.OOT)
        self.assertEqual(NPUPlatform.device_name, "npu")
        self.assertEqual(NPUPlatform.device_type, "npu")
        self.assertEqual(NPUPlatform.simple_compile_backend, "eager")
        self.assertEqual(NPUPlatform.ray_device_key, "NPU")
        self.assertEqual(NPUPlatform.device_control_env_var,
                         "ASCEND_RT_VISIBLE_DEVICES")
        self.assertEqual(NPUPlatform.dispatch_key, "PrivateUse1")
        self.assertEqual(
            NPUPlatform.supported_quantization,
            [ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD])

    def test_is_sleep_mode_available(self):
        self.assertTrue(self.platform.is_sleep_mode_available())
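
    # pre_register_and_update() is expected to add the "ascend" quantization
    # method to the CLI --quantization choices exactly once, and to tolerate
    # a missing parser or a parser without a --quantization action.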

    @patch("vllm_ascend.utils.adapt_patch")
    @patch("vllm_ascend.quantization.quant_config.AscendQuantConfig")
    def test_pre_register_and_update_with_parser(self, mock_quant_config,
                                                 mock_adapt_patch):
        mock_parser = MagicMock()
        mock_action = MagicMock()
        mock_action.choices = ["awq", "gptq"]
        mock_parser._option_string_actions = {"--quantization": mock_action}
        self.platform.pre_register_and_update(mock_parser)
        mock_adapt_patch.assert_called_once_with(is_global_patch=True)
        self.assertTrue(ASCEND_QUANTIZATION_METHOD in mock_action.choices)
        self.assertEqual(len(mock_action.choices), 3)  # original 2 + ascend

    @patch("vllm_ascend.utils.adapt_patch")
    @patch("vllm_ascend.quantization.quant_config.AscendQuantConfig")
    def test_pre_register_and_update_without_parser(self, mock_quant_config,
                                                    mock_adapt_patch):
        self.platform.pre_register_and_update(None)
        mock_adapt_patch.assert_called_once_with(is_global_patch=True)

    @patch("vllm_ascend.utils.adapt_patch")
    @patch("vllm_ascend.quantization.quant_config.AscendQuantConfig")
    def test_pre_register_and_update_with_parser_no_quant_action(
            self, mock_quant_config, mock_adapt_patch):
        mock_parser = MagicMock()
        mock_parser._option_string_actions = {}
        self.platform.pre_register_and_update(mock_parser)
        mock_adapt_patch.assert_called_once_with(is_global_patch=True)

    @patch("vllm_ascend.utils.adapt_patch")
    @patch("vllm_ascend.quantization.quant_config.AscendQuantConfig")
    def test_pre_register_and_update_with_existing_ascend_quant(
            self, mock_quant_config, mock_adapt_patch):
        mock_parser = MagicMock()
        mock_action = MagicMock()
        mock_action.choices = ["awq", ASCEND_QUANTIZATION_METHOD]
        mock_parser._option_string_actions = {"--quantization": mock_action}
        self.platform.pre_register_and_update(mock_parser)
        mock_adapt_patch.assert_called_once_with(is_global_patch=True)
        self.assertEqual(len(mock_action.choices), 2)

    def test_get_device_capability(self):
        self.assertIsNone(self.platform.get_device_capability(device_id=0))

    @patch("torch.npu.get_device_name")
    def test_get_device_name(self, mock_get_device_name):
        device_id = 0
        device_name = "Ascend910B2"
        mock_get_device_name.return_value = device_name
        self.assertEqual(self.platform.get_device_name(device_id), device_name)
        mock_get_device_name.assert_called_once_with(0)

    @patch("torch.inference_mode")
    def test_inference_mode(self, mock_inference_mode):
        mock_inference_mode.return_value = None
        self.assertIsNone(self.platform.inference_mode())
        mock_inference_mode.assert_called_once()
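
    # The check_and_update_config tests below reload vllm_ascend.platform so
    # that the patched init_ascend_config is the one resolved by the hook; see
    # the reload comment in the first of these tests.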

    @patch("vllm_ascend.ascend_config.init_ascend_config")
    @patch("vllm_ascend.utils.update_aclgraph_sizes")
    @patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3)
    @patch("os.environ", {})
    @patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config")
    def test_check_and_update_config_basic_config_update(
            self, mock_init_recompute, mock_soc_version, mock_update_acl, mock_init_ascend
    ):
        mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
        vllm_config = TestNPUPlatform.mock_vllm_config()
        vllm_config.parallel_config.enable_expert_parallel = False
        vllm_config.parallel_config.decode_context_parallel_size = 1
        vllm_config.parallel_config.prefill_context_parallel_size = 1
        vllm_config.parallel_config.tensor_parallel_size = 1
        mock_init_recompute.return_value = MagicMock()
        vllm_config.scheduler_config = MagicMock()
        # Reload the platform module so the mocked init_ascend_config is used.
        # Without the reload, self.platform.check_and_update_config would call
        # the original, unmocked init_ascend_config and the test would fail.
        from vllm_ascend import platform
        importlib.reload(platform)
        self.platform.check_and_update_config(vllm_config)
        mock_init_ascend.assert_called_once_with(vllm_config)

    @patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3)
    @patch("vllm_ascend.ascend_config.init_ascend_config")
    @patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config")
    def test_check_and_update_config_no_model_config_warning(
            self, mock_init_recompute, mock_init_ascend, mock_soc_version
    ):
        mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
        vllm_config = TestNPUPlatform.mock_vllm_config()
        vllm_config.model_config = None
        vllm_config.parallel_config.decode_context_parallel_size = 1
        vllm_config.parallel_config.prefill_context_parallel_size = 1
        vllm_config.parallel_config.tensor_parallel_size = 1
        mock_init_recompute.return_value = MagicMock()
        vllm_config.scheduler_config = MagicMock()
        with self.assertLogs(logger="vllm", level="WARNING") as cm:
            from vllm_ascend import platform
            importlib.reload(platform)
            self.platform = platform.NPUPlatform()
            with patch.object(platform.NPUPlatform, "_fix_incompatible_config"):
                self.platform.check_and_update_config(vllm_config)
            self.assertTrue("Model config is missing" in cm.output[0])

    @patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3)
    @patch("vllm_ascend.ascend_config.init_ascend_config")
    @patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config")
    def test_check_and_update_config_enforce_eager_mode(
            self, mock_init_recompute, mock_init_ascend, mock_soc_version
    ):
        mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
        vllm_config = TestNPUPlatform.mock_vllm_config()
        vllm_config.model_config.enforce_eager = True
        vllm_config.parallel_config.decode_context_parallel_size = 1
        vllm_config.parallel_config.prefill_context_parallel_size = 1
        vllm_config.parallel_config.tensor_parallel_size = 1
        mock_init_recompute.return_value = MagicMock()
        vllm_config.scheduler_config = MagicMock()
        with self.assertLogs(logger="vllm", level="INFO") as cm:
            from vllm_ascend import platform
            importlib.reload(platform)
            self.platform = platform.NPUPlatform()
            with patch.object(platform.NPUPlatform, "_fix_incompatible_config"):
                self.platform.check_and_update_config(vllm_config)
            self.assertTrue(
                "Compilation disabled, using eager mode by default" in cm.output[0])
            self.assertEqual(
                vllm_config.compilation_config.mode,
                CompilationMode.NONE,
            )
            self.assertEqual(
                vllm_config.compilation_config.cudagraph_mode,
                CUDAGraphMode.NONE,
            )
[Bugfix] Reset incompatible config (#6005) ### What this PR does / why we need it? This PR introduces compatibility fixes for running vLLM on Ascend NPU hardware. The changes ensure that GPU-specific parameters are automatically detected and reset to Ascend-compatible values with appropriate warnings logged. | Module | Parameter | Default Value | |--------|-----------|---------------| | Model Config | `disable_cascade_attn` | `False` | | Parallel Config | `all2all_backend` | `"allgather_reducescatter"` | | Cache Config | `cpu_kvcache_space_bytes` | `None` | | MultiModal Config | `mm_encoder_attn_backend` | `None` | | Observability Config | `enable_layerwise_nvtx_tracing` | `False` | | Scheduler Config | `max_num_partial_prefills` | `1` | | Speculative Config | `quantization` | `None` | | KV Transfer Config | `kv_buffer_size` | `1e9` | | KV Transfer Config | `enable_permute_local_kv` | `False` | | Attention Config | `use_prefill_decode_attention` | `False` | | Attention Config | `use_cudnn_prefill` | `False` | | Attention Config | `use_trtllm_ragged_deepseek_prefill` | `False` | | Attention Config | `use_trtllm_attention` | `False` | | Attention Config | `disable_flashinfer_prefill` | `False` | | Attention Config | `disable_flashinfer_q_quantization` | `False` | | Attention Config | `flash_attn_version` | `None` | | Attention Config | `backend` | `None` | | Attention Config | `flash_attn_max_num_splits_for_cuda_graph` | `32` | ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2c24bc6996cb165fce92f780b388a5e39b3f4060 Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2026-01-20 11:02:38 +08:00
@patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3)
@patch("vllm_ascend.utils.update_default_aclgraph_sizes")
@patch("vllm_ascend.ascend_config.init_ascend_config")
@patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config")
def test_check_and_update_config_unsupported_compilation_level(
self, mock_init_recompute, mock_init_ascend, mock_update_default, mock_soc_version
):
mock_update_default.return_value = MagicMock()
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
vllm_config = TestNPUPlatform.mock_vllm_config()
vllm_config.model_config.enforce_eager = False
vllm_config.parallel_config.decode_context_parallel_size = 1
vllm_config.parallel_config.prefill_context_parallel_size = 1
vllm_config.parallel_config.tensor_parallel_size = 1
mock_init_recompute.return_value = MagicMock()
vllm_config.scheduler_config = MagicMock()
vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
with self.assertLogs(logger="vllm", level="WARNING") as cm:
from vllm_ascend import platform
importlib.reload(platform)
self.platform = platform.NPUPlatform()
with patch.object(platform.NPUPlatform, "_fix_incompatible_config"):
self.platform.check_and_update_config(vllm_config)
self.assertTrue("NPU does not support" in cm.output[0])
self.assertEqual(
vllm_config.compilation_config.mode,
CompilationMode.NONE,
)
self.assertEqual(
vllm_config.compilation_config.cudagraph_mode,
CUDAGraphMode.NONE,
)
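
# Skipped until vLLM lets out-of-tree platforms set cudagraph_mode; verifies that
# CUDAGraphMode.FULL is reset to NONE with an informational log.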
@pytest.mark.skip("Revert me when vllm supports setting cudagraph_mode on oot platform")
@patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3)
@patch("vllm_ascend.ascend_config.init_ascend_config")
def test_check_and_update_config_unsupported_cudagraph_mode(self, mock_init_ascend, mock_soc_version):
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
vllm_config = TestNPUPlatform.mock_vllm_config()
vllm_config.model_config.enforce_eager = False
vllm_config.compilation_config.cudagraph_mode = CUDAGraphMode.FULL
with self.assertLogs(logger="vllm", level="INFO") as cm:
from vllm_ascend import platform
importlib.reload(platform)
self.platform.check_and_update_config(vllm_config)
self.assertTrue("cudagraph_mode is not support on NPU. falling back to NONE" in cm.output[0])
self.assertEqual(
vllm_config.compilation_config.mode,
CompilationMode.NONE,
)
self.assertEqual(
vllm_config.compilation_config.cudagraph_mode,
CUDAGraphMode.NONE,
)
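
# An unset cache_config.block_size should default to 128 on NPU.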
@patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3)
@patch("vllm_ascend.ascend_config.init_ascend_config")
@patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config")
def test_check_and_update_config_cache_config_block_size(
self, mock_init_recompute, mock_init_ascend, mock_soc_version
):
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
vllm_config = TestNPUPlatform.mock_vllm_config()
vllm_config.cache_config.block_size = None
vllm_config.cache_config.enable_prefix_caching = True
vllm_config.parallel_config.decode_context_parallel_size = 1
vllm_config.parallel_config.prefill_context_parallel_size = 1
vllm_config.parallel_config.tensor_parallel_size = 1
mock_init_recompute.return_value = MagicMock()
vllm_config.scheduler_config = MagicMock()
from vllm_ascend import platform
importlib.reload(platform)
self.platform.check_and_update_config(vllm_config)
self.assertEqual(vllm_config.cache_config.block_size, 128)
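
# With worker_cls="auto" the platform should pick NPUWorker, or XliteWorker when
# the xlite graph config is enabled.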
@patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3)
@patch("vllm_ascend.ascend_config.init_ascend_config")
@patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config")
def test_check_and_update_config_v1_worker_class_selection(
self, mock_init_recompute, mock_init_ascend, mock_soc_version
):
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
vllm_config = TestNPUPlatform.mock_vllm_config()
vllm_config.parallel_config.worker_cls = "auto"
vllm_config.parallel_config.decode_context_parallel_size = 1
vllm_config.parallel_config.prefill_context_parallel_size = 1
vllm_config.parallel_config.tensor_parallel_size = 1
mock_init_recompute.return_value = MagicMock()
vllm_config.scheduler_config = MagicMock()
from vllm_ascend import platform
importlib.reload(platform)
self.platform.check_and_update_config(vllm_config)
self.assertEqual(
vllm_config.parallel_config.worker_cls,
"vllm_ascend.worker.worker.NPUWorker",
)
test_ascend_config = TestNPUPlatform.mock_vllm_ascend_config()
test_ascend_config.xlite_graph_config.enabled = True
mock_init_ascend.return_value = test_ascend_config
vllm_config.parallel_config.worker_cls = "auto"
self.platform.check_and_update_config(vllm_config)
self.assertEqual(
vllm_config.parallel_config.worker_cls,
"vllm_ascend.xlite.xlite_worker.XliteWorker",
)
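
# On 310P devices an already-empty compilation_config.custom_ops list stays empty.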
@patch("vllm_ascend.ascend_config.init_ascend_config")
@patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType._310P)
@patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config")
def test_check_and_update_config_310p_no_custom_ops(self, mock_init_recompute, mock_soc_version, mock_init_ascend):
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
vllm_config = TestNPUPlatform.mock_vllm_config()
vllm_config.compilation_config.custom_ops = []
vllm_config.parallel_config.decode_context_parallel_size = 1
vllm_config.parallel_config.prefill_context_parallel_size = 1
vllm_config.parallel_config.tensor_parallel_size = 1
mock_init_recompute.return_value = MagicMock()
vllm_config.scheduler_config = MagicMock()
from vllm_ascend import platform
importlib.reload(platform)
self.platform.check_and_update_config(vllm_config)
self.assertEqual(vllm_config.compilation_config.custom_ops, [])
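
# use_mla=True should select the Ascend MLA attention backend.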
def test_get_attn_backend_cls_use_v1_and_mla(self):
attn_selector_config = AttentionSelectorConfig(
dtype=torch.float16,
head_size=0,
kv_cache_dtype=None,
block_size=128,
use_mla=True,
use_sparse=False,
)
result = self.platform.get_attn_backend_cls("ascend", attn_selector_config)
self.assertEqual(result, "vllm_ascend.attention.mla_v1.AscendMLABackend")
def test_get_attn_backend_cls_use_v1_only(self):
attn_selector_config = AttentionSelectorConfig(
dtype=torch.float16,
head_size=0,
kv_cache_dtype=None,
block_size=128,
use_mla=False,
use_sparse=False,
)
result = self.platform.get_attn_backend_cls("ascend", attn_selector_config)
self.assertEqual(result, "vllm_ascend.attention.attention_v1.AscendAttentionBackend")
def test_get_punica_wrapper(self):
result = self.platform.get_punica_wrapper()
self.assertEqual(result, "vllm_ascend.lora.punica_npu.PunicaWrapperNPU")
@patch("torch.npu.reset_peak_memory_stats")
@patch("torch.npu.max_memory_allocated")
def test_get_current_memory_usage_with_specific_device(self, mock_max_memory, mock_reset_stats):
max_memory_allocated_result = 1024.0
mock_max_memory.return_value = max_memory_allocated_result
test_device = torch.device("npu:0")
result = self.platform.get_current_memory_usage(device=test_device)
mock_reset_stats.assert_called_once_with(test_device)
mock_max_memory.assert_called_once_with(test_device)
self.assertEqual(result, max_memory_allocated_result)
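
# Without an explicit device, both torch.npu calls receive None.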
@patch("torch.npu.reset_peak_memory_stats")
@patch("torch.npu.max_memory_allocated")
def test_get_current_memory_usage_with_default_device(self, mock_max_memory, mock_reset_stats):
max_memory_allocated_result = 1024.0
mock_max_memory.return_value = max_memory_allocated_result
result = self.platform.get_current_memory_usage()
mock_reset_stats.assert_called_once_with(None)
mock_max_memory.assert_called_once_with(None)
self.assertEqual(result, max_memory_allocated_result)
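
# A failure in reset_peak_memory_stats propagates and the memory query is skipped.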
@patch("torch.npu.reset_peak_memory_stats", side_effect=RuntimeError("Device error"))
@patch("torch.npu.max_memory_allocated")
def test_get_current_memory_usage_when_reset_stats_fails(self, mock_max_memory, mock_reset_stats):
with self.assertRaises(RuntimeError):
self.platform.get_current_memory_usage()
mock_reset_stats.assert_called_once()
mock_max_memory.assert_not_called()
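
# A failure in max_memory_allocated propagates after peak stats were reset.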
@patch("torch.npu.reset_peak_memory_stats")
@patch(
"torch.npu.max_memory_allocated",
side_effect=RuntimeError("Memory query failed"),
)
def test_get_current_memory_usage_when_query_fails(self, mock_max_memory, mock_reset_stats):
with self.assertRaises(RuntimeError):
self.platform.get_current_memory_usage()
mock_reset_stats.assert_called_once()
mock_max_memory.assert_called_once()
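
# Remaining platform hooks: device communicator class path, pin-memory availability,
# and the ACL graph wrapper class path.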
def test_get_device_communicator_cls_returns_correct_value(self):
self.assertEqual(
self.platform.get_device_communicator_cls(),
"vllm_ascend.distributed.device_communicators.npu_communicator.NPUCommunicator",
)
def test_is_pin_memory_available_returns_true(self):
self.assertTrue(self.platform.is_pin_memory_available())
def test_get_static_graph_wrapper_cls_returns_correct_value(self):
self.assertEqual(
self.platform.get_static_graph_wrapper_cls(),
"vllm_ascend.compilation.acl_graph.ACLGraphWrapper",
)