diff --git a/tests/e2e/singlecard/test_aclgraph_accuracy.py b/tests/e2e/singlecard/test_aclgraph_accuracy.py
index 5b03c0c4..b1878862 100644
--- a/tests/e2e/singlecard/test_aclgraph_accuracy.py
+++ b/tests/e2e/singlecard/test_aclgraph_accuracy.py
@@ -192,3 +192,22 @@ def test_output_between_eager_and_full_decode_only(
         name_0="vllm_eager_outputs",
         name_1="vllm_aclgraph_outputs",
     )
+
+
+def test_aclgraph_enable():
+    # Strictly speaking, this test does not belong in e2e, but it is a good
+    # way to check whether aclgraph is enabled in a real environment.
+    from vllm.config.compilation import CompilationMode, CUDAGraphMode
+    from vllm.engine.arg_utils import EngineArgs
+
+    from vllm_ascend.platform import NPUPlatform
+
+    # vLLM's default cudagraph mode is PIECEWISE.
+    config = EngineArgs()
+    vllm_config = config.create_engine_config()
+    assert vllm_config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
+
+    # After check_and_update_config, mode should be VLLM_COMPILE with PIECEWISE cudagraph.
+    NPUPlatform.check_and_update_config(vllm_config)
+    assert vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE
+    assert vllm_config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
diff --git a/tests/ut/attention/test_mla_v1.py b/tests/ut/attention/test_mla_v1.py
index 97caf4b1..5fc63e9e 100644
--- a/tests/ut/attention/test_mla_v1.py
+++ b/tests/ut/attention/test_mla_v1.py
@@ -1,3 +1,4 @@
+import os
 from unittest.mock import MagicMock, patch
 
 import torch
@@ -507,8 +508,6 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
 
     def setUp(self):
         self.mock_vllm_config = MagicMock(spec=VllmConfig)
-        self.mock_vllm_config.model_config = ModelConfig(max_model_len=2048)
-        self.mock_vllm_config.model_config.hf_text_config.qk_rope_head_dim = 32
         self.mock_vllm_config.cache_config = CacheConfig(block_size=32)
         mock_scheduler_config = MagicMock(spec=SchedulerConfig)
         mock_scheduler_config.max_num_seqs = 8
@@ -516,7 +515,16 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
         self.mock_vllm_config.scheduler_config = mock_scheduler_config
         self.mock_vllm_config.speculative_config = None
         self.mock_device = torch.device("cpu")
-
+        # Build a real ModelConfig from the local fake weights instead of mocking it.
+        fake_weight_path = os.path.join(os.path.dirname(__file__), "..",
+                                        "fake_weight")
+        model_config = ModelConfig(
+            model=fake_weight_path,
+            skip_tokenizer_init=True,
+        )
+        model_config.hf_text_config.head_dim = 128
+        model_config.hf_text_config.qk_rope_head_dim = 32
+        self.mock_vllm_config.model_config = model_config
         self.kv_cache_spec = MagicMock()
         self.kv_cache_spec.num_layers = 32
         self.kv_cache_spec.head_size = 128
diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py
index 2eb4e932..129994e0 100644
--- a/tests/ut/test_platform.py
+++ b/tests/ut/test_platform.py
@@ -4,7 +4,6 @@ from unittest.mock import MagicMock, patch
 import pytest
 import torch
 from vllm.config.compilation import CompilationMode, CUDAGraphMode
-from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import PlatformEnum
 
 from tests.ut.base import TestBase
@@ -579,30 +578,3 @@ class TestNPUPlatform(TestBase):
             self.platform.get_static_graph_wrapper_cls(),
             "vllm_ascend.compilation.acl_graph.ACLGraphWrapper",
         )
-
-    def test_aclgraph_enable(self):
-        config = EngineArgs()
-        VllmConfig = config.create_engine_config()
-        self.assertEqual(VllmConfig.compilation_config.cudagraph_mode,
-                         CUDAGraphMode.PIECEWISE)
-
-        with self.assertLogs(logger="vllm", level="INFO") as cm:
-            from vllm_ascend import platform
-
-            importlib.reload(platform)
-            self.platform.check_and_update_config(VllmConfig)
-            target_msg = "PIECEWISE compilation enabled on NPU. use_inductor not supported - using only ACL Graph mode"
"PIECEWISE compilation enabled on NPU. use_inductor not supported - using only ACL Graph mode" - found = any(target_msg in log for log in cm.output) - - self.assertTrue( - found, - f"Expected log message not found. Captured logs: {cm.output}") - - self.assertEqual( - VllmConfig.compilation_config.mode, - CompilationMode.VLLM_COMPILE, - ) - self.assertEqual( - VllmConfig.compilation_config.cudagraph_mode, - CUDAGraphMode.PIECEWISE, - ) diff --git a/vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py b/vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py index a0b8d5d3..198eeee0 100644 --- a/vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py +++ b/vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py @@ -4,6 +4,7 @@ from abc import abstractmethod from collections import defaultdict import numpy as np +from vllm.logger import logger class DynamicConfig: @@ -69,21 +70,21 @@ class DynamicEplbV2(EplbPolicy): @staticmethod def safe_divide(a, b): if b == 0: - print("Division by zero is not allowed") + logger.info("Division by zero is not allowed") return 0 return a / b @staticmethod def safe_exact_divide(a, b): if b == 0: - print("Division by zero is not allowed") + logger.info("Division by zero is not allowed") return 0 return a // b @staticmethod def safe_mod(a, b): if b == 0: - print("Division by zero is not allowed") + logger.info("Division by zero is not allowed") return 0 return a % b diff --git a/vllm_ascend/model_loader/netloader/interaction/elastic.py b/vllm_ascend/model_loader/netloader/interaction/elastic.py index 1000bd7c..61b2ad3b 100644 --- a/vllm_ascend/model_loader/netloader/interaction/elastic.py +++ b/vllm_ascend/model_loader/netloader/interaction/elastic.py @@ -60,7 +60,7 @@ class ElasticClient: ip, port_str = source.split(':') port = int(port_str) except Exception as e: - logger.error(f"IP format error: {source}, detail: {e}") + logger.info(f"IP format error: {source}, detail: {e}") continue self.server_addr = ip diff --git a/vllm_ascend/model_loader/netloader/load.py b/vllm_ascend/model_loader/netloader/load.py index 90000d58..4dd24107 100644 --- a/vllm_ascend/model_loader/netloader/load.py +++ b/vllm_ascend/model_loader/netloader/load.py @@ -80,5 +80,5 @@ def elastic_load( time.perf_counter() - t0)) return model_loaded except Exception as e: - logger.error(f"elastic_load error: {e}") + logger.info(f"elastic_load error: {e}") return None