[Refactor] Cleanup platform (#5566)
### What this PR does / why we need it?
1. add `COMPILATION_PASS_KEY` constant
2. clean up useless platform interfaces `empty_cache`, `synchronize`,
`mem_get_info`, `clear_npu_memory`
3. rename `CUSTOM_OP_REGISTERED` to `_CUSTOM_OP_REGISTERED`
4. remove useless env `VLLM_ENABLE_CUDAGRAPH_GC`
NPUPlatform is the interface called by vLLM. Do not call it from inside
vllm-ascend.
### Does this PR introduce _any_ user-facing change?
This PR is just a cleanup. All CI should pass.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
7157596103
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -879,7 +879,6 @@ class TestAscendMLAImpl(TestBase):
|
||||
B, H, D = 4, self.impl.num_heads, self.impl.v_head_dim # total: [4, 4, 8]
|
||||
test_cases = [(1, 1), (1, 2), (2, 1), (2, 2), (4, 4)]
|
||||
for test_case in test_cases:
|
||||
print(test_case)
|
||||
self.impl.dcp_size = test_case[0]
|
||||
self.impl.pcp_size = test_case[1]
|
||||
mock_dcp.world_size = test_case[0]
|
||||
|
||||
@@ -128,10 +128,17 @@ class TestCaMem(PytestBase):
|
||||
2000: data2,
|
||||
}
|
||||
|
||||
# mock is_pin_memory_available, return False as some machine only has cpu
|
||||
with patch(
|
||||
"vllm_ascend.device_allocator.camem.NPUPlatform.is_pin_memory_available",
|
||||
return_value=False):
|
||||
# Mock torch.empty to force pin_memory=False
|
||||
original_torch_empty = torch.empty
|
||||
|
||||
def mock_torch_empty(*args, **kwargs):
|
||||
# If pin_memory was explicitly set to True, change it to False
|
||||
if 'pin_memory' in kwargs and kwargs['pin_memory'] is True:
|
||||
kwargs['pin_memory'] = False
|
||||
return original_torch_empty(*args, **kwargs)
|
||||
|
||||
with patch("vllm_ascend.device_allocator.camem.torch.empty",
|
||||
side_effect=mock_torch_empty):
|
||||
allocator.sleep(offload_tags="tag1")
|
||||
|
||||
# only offload tag1, other tag2 call unmap_and_release
|
||||
|
||||
@@ -120,115 +120,6 @@ class TestNPUPlatform(TestBase):
|
||||
self.assertIsNone(self.platform.inference_mode())
|
||||
mock_inference_mode.assert_called_once()
|
||||
|
||||
@patch("torch.npu.set_device")
|
||||
def test_set_device_normal(self, mock_set_device):
|
||||
device = torch.device("npu:0")
|
||||
self.platform.set_device(device)
|
||||
mock_set_device.assert_called_once_with(device)
|
||||
|
||||
@patch("torch.npu.set_device",
|
||||
side_effect=RuntimeError("Device not available"))
|
||||
def test_set_device_failure(self, mock_set_device):
|
||||
device = torch.device("npu:0")
|
||||
with self.assertRaises(RuntimeError):
|
||||
self.platform.set_device(device)
|
||||
mock_set_device.assert_called_once_with(device)
|
||||
|
||||
@patch("torch.npu.empty_cache")
|
||||
def test_empty_cache_normal(self, mock_empty_cache):
|
||||
self.platform.empty_cache()
|
||||
mock_empty_cache.assert_called_once()
|
||||
|
||||
@patch("torch.npu.empty_cache",
|
||||
side_effect=RuntimeError("Cache clearing failed"))
|
||||
def test_empty_cache_failure(self, mock_empty_cache):
|
||||
with self.assertRaises(RuntimeError):
|
||||
self.platform.empty_cache()
|
||||
mock_empty_cache.assert_called_once()
|
||||
|
||||
@patch("torch.npu.synchronize")
|
||||
def test_synchronize_normal(self, mock_synchronize):
|
||||
self.platform.synchronize()
|
||||
mock_synchronize.assert_called_once()
|
||||
|
||||
@patch("torch.npu.synchronize",
|
||||
side_effect=RuntimeError("Synchronization failed"))
|
||||
def test_synchronize_failure(self, mock_synchronize):
|
||||
with self.assertRaises(RuntimeError):
|
||||
self.platform.synchronize()
|
||||
mock_synchronize.assert_called_once()
|
||||
|
||||
@patch("torch.npu.mem_get_info")
|
||||
def test_mem_get_info_normal(self, mock_mem_get_info):
|
||||
free_memory_size = 1024
|
||||
total_memory_size = 2048
|
||||
memory_info = (free_memory_size, total_memory_size)
|
||||
mock_mem_get_info.return_value = memory_info
|
||||
result = self.platform.mem_get_info()
|
||||
self.assertIsInstance(result, tuple)
|
||||
self.assertEqual(len(result), 2)
|
||||
self.assertEqual(result, memory_info)
|
||||
mock_mem_get_info.assert_called_once()
|
||||
|
||||
@patch("torch.npu.mem_get_info",
|
||||
side_effect=RuntimeError("NPU not available"))
|
||||
def test_mem_get_info_failure(self, mock_mem_get_info):
|
||||
with self.assertRaises(RuntimeError):
|
||||
self.platform.mem_get_info()
|
||||
mock_mem_get_info.assert_called_once()
|
||||
|
||||
@patch("gc.collect")
|
||||
@patch("torch.npu.empty_cache")
|
||||
@patch("torch.npu.reset_peak_memory_stats")
|
||||
def test_clear_npu_memory_normal(self, mock_reset_stats, mock_empty_cache,
|
||||
mock_gc_collect):
|
||||
self.platform.clear_npu_memory()
|
||||
|
||||
mock_gc_collect.assert_called_once()
|
||||
mock_empty_cache.assert_called_once()
|
||||
mock_reset_stats.assert_called_once()
|
||||
|
||||
@patch("gc.collect", side_effect=Exception("GC failed"))
|
||||
@patch("torch.npu.empty_cache")
|
||||
@patch("torch.npu.reset_peak_memory_stats")
|
||||
def test_clear_npu_memory_gc_collect_failure(self, mock_reset_stats,
|
||||
mock_empty_cache,
|
||||
mock_gc_collect):
|
||||
with self.assertRaises(Exception):
|
||||
self.platform.clear_npu_memory()
|
||||
|
||||
mock_gc_collect.assert_called_once()
|
||||
mock_empty_cache.assert_not_called()
|
||||
mock_reset_stats.assert_not_called()
|
||||
|
||||
@patch("gc.collect")
|
||||
@patch("torch.npu.empty_cache",
|
||||
side_effect=RuntimeError("Cache clear failed"))
|
||||
@patch("torch.npu.reset_peak_memory_stats")
|
||||
def test_clear_npu_memory_empty_cache_failure(self, mock_reset_stats,
|
||||
mock_empty_cache,
|
||||
mock_gc_collect):
|
||||
with self.assertRaises(RuntimeError):
|
||||
self.platform.clear_npu_memory()
|
||||
|
||||
mock_gc_collect.assert_called_once()
|
||||
mock_empty_cache.assert_called_once()
|
||||
mock_reset_stats.assert_not_called()
|
||||
|
||||
@patch("gc.collect")
|
||||
@patch("torch.npu.empty_cache")
|
||||
@patch("torch.npu.reset_peak_memory_stats",
|
||||
side_effect=RuntimeError("Reset failed"))
|
||||
def test_clear_npu_memory_reset_stats_failure(self, mock_reset_stats,
|
||||
mock_empty_cache,
|
||||
mock_gc_collect):
|
||||
with self.assertRaises(RuntimeError):
|
||||
self.platform.clear_npu_memory()
|
||||
|
||||
mock_gc_collect.assert_called_once()
|
||||
mock_empty_cache.assert_called_once()
|
||||
mock_reset_stats.assert_called_once()
|
||||
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch("vllm_ascend.utils.update_aclgraph_sizes")
|
||||
@patch('vllm_ascend.utils.get_ascend_device_type',
|
||||
|
||||
@@ -238,15 +238,18 @@ class TestNPUWorker(TestBase):
|
||||
@patch(
|
||||
"vllm_ascend.worker.worker.NPUWorker._init_worker_distributed_environment"
|
||||
)
|
||||
@patch("vllm_ascend.worker.worker.NPUPlatform")
|
||||
@patch("vllm_ascend.worker.worker.init_device_properties_triton")
|
||||
def test_init_device(self, mock_init_triton, mock_platform,
|
||||
@patch("torch.npu.set_device")
|
||||
@patch("torch.npu.empty_cache")
|
||||
@patch("torch.npu.mem_get_info")
|
||||
def test_init_device(self, mock_mem_get_info, mock_set_device,
|
||||
mock_empty_cache, mock_init_triton,
|
||||
mock_init_dist_env):
|
||||
"""Test _init_device method"""
|
||||
from vllm_ascend.worker.worker import NPUWorker
|
||||
|
||||
# Setup mock
|
||||
mock_platform.mem_get_info.return_value = (1000, 2000)
|
||||
mock_mem_get_info.return_value = (1000, 2000)
|
||||
|
||||
# Create worker mock
|
||||
with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None):
|
||||
@@ -256,21 +259,13 @@ class TestNPUWorker(TestBase):
|
||||
worker.parallel_config = MagicMock()
|
||||
worker.parallel_config.local_world_size = 0
|
||||
worker.parallel_config.data_parallel_size = 1
|
||||
|
||||
worker.model_config.seed = 42
|
||||
|
||||
# Test _init_device
|
||||
result = worker._init_device()
|
||||
|
||||
# Verify NPUPlatform.set_device is called
|
||||
mock_platform.set_device.assert_called_once()
|
||||
# Verify the parameter passed to set_device is a torch.device object
|
||||
call_args = mock_platform.set_device.call_args[0][0]
|
||||
self.assertEqual(str(call_args), "npu:1")
|
||||
|
||||
mock_platform.empty_cache.assert_called_once()
|
||||
mock_platform.seed_everything.assert_called_once_with(42)
|
||||
mock_platform.mem_get_info.assert_called_once(
|
||||
mock_mem_get_info.assert_called_once(
|
||||
) # Called once in _init_device method
|
||||
mock_init_dist_env.assert_called_once(
|
||||
) # Verify distributed initialization is called
|
||||
@@ -548,9 +543,8 @@ class TestNPUWorker(TestBase):
|
||||
# Verify returns None (empty string is considered false)
|
||||
self.assertIsNone(result)
|
||||
|
||||
@patch("vllm_ascend.worker.worker.NPUPlatform.clear_npu_memory")
|
||||
@patch("vllm_ascend.worker.worker.NPUPlatform.empty_cache")
|
||||
@patch("vllm_ascend.worker.worker.NPUPlatform.mem_get_info")
|
||||
@patch("torch.npu.reset_peak_memory_stats")
|
||||
@patch("torch.npu.empty_cache")
|
||||
@patch("torch_npu.npu.memory_stats")
|
||||
@patch("torch_npu.npu.mem_get_info")
|
||||
@patch("vllm_ascend.worker.worker.logger")
|
||||
@@ -559,15 +553,14 @@ class TestNPUWorker(TestBase):
|
||||
mock_logger,
|
||||
mock_torch_mem_get_info,
|
||||
mock_torch_memory_stats,
|
||||
mock_platform_mem_get_info,
|
||||
mock_platform_empty_cache,
|
||||
mock_platform_clear_npu_memory,
|
||||
mock_torch_empty_cache,
|
||||
mock_torch_reset_peak_memory_stats,
|
||||
):
|
||||
"""Test determine_available_memory normal case (no non-torch memory allocation)"""
|
||||
from vllm_ascend.worker.worker import NPUWorker
|
||||
|
||||
# Setup mock - test case without non-torch memory allocation
|
||||
mock_platform_mem_get_info.side_effect = [
|
||||
mock_torch_mem_get_info.side_effect = [
|
||||
(8000, 10000), # 1st call: before profile execution
|
||||
(7000, 10000), # 2nd call: after profile execution
|
||||
]
|
||||
@@ -606,10 +599,8 @@ class TestNPUWorker(TestBase):
|
||||
result = worker.determine_available_memory()
|
||||
|
||||
# Verify call count and order
|
||||
mock_platform_clear_npu_memory.assert_called_once()
|
||||
self.assertEqual(mock_platform_mem_get_info.call_count, 2)
|
||||
self.assertEqual(mock_torch_mem_get_info.call_count, 4)
|
||||
worker.model_runner.profile_run.assert_called_once()
|
||||
mock_platform_empty_cache.assert_called_once()
|
||||
|
||||
# Verify calculation result with race condition simulation
|
||||
# Calculation logic:
|
||||
@@ -629,24 +620,22 @@ class TestNPUWorker(TestBase):
|
||||
# Verify log output
|
||||
mock_logger.info.assert_called_once()
|
||||
|
||||
@patch("vllm_ascend.worker.worker.NPUPlatform.clear_npu_memory")
|
||||
@patch("vllm_ascend.worker.worker.NPUPlatform.empty_cache")
|
||||
@patch("vllm_ascend.worker.worker.NPUPlatform.mem_get_info")
|
||||
@patch("torch.npu.reset_peak_memory_stats")
|
||||
@patch("torch.npu.empty_cache")
|
||||
@patch("torch_npu.npu.memory_stats")
|
||||
@patch("torch_npu.npu.mem_get_info")
|
||||
def test_determine_available_memory_with_non_torch_allocations(
|
||||
self,
|
||||
mock_torch_mem_get_info,
|
||||
mock_torch_memory_stats,
|
||||
mock_platform_mem_get_info,
|
||||
mock_platform_empty_cache,
|
||||
mock_platform_clear_npu_memory,
|
||||
mock_torch_empty_cache,
|
||||
mock_torch_reset_peak_memory_stats,
|
||||
):
|
||||
"""Test determine_available_memory with significant non-torch memory allocation"""
|
||||
from vllm_ascend.worker.worker import NPUWorker
|
||||
|
||||
# Setup mock - test case with large non-torch memory allocation
|
||||
mock_platform_mem_get_info.side_effect = [
|
||||
mock_torch_mem_get_info.side_effect = [
|
||||
(8000, 10000), # 1st call
|
||||
(7000, 10000), # 2nd call
|
||||
]
|
||||
@@ -695,15 +684,17 @@ class TestNPUWorker(TestBase):
|
||||
expected_result = max(0, int(10000 * 0.9 - 5500))
|
||||
self.assertEqual(result, expected_result)
|
||||
|
||||
@patch("vllm_ascend.worker.worker.NPUPlatform.clear_npu_memory")
|
||||
@patch("vllm_ascend.worker.worker.NPUPlatform.mem_get_info")
|
||||
@patch("torch.npu.mem_get_info")
|
||||
@patch("torch.npu.reset_peak_memory_stats")
|
||||
@patch("torch.npu.empty_cache")
|
||||
def test_determine_available_memory_memory_profiling_error(
|
||||
self, mock_platform_mem_get_info, mock_platform_clear_npu_memory):
|
||||
self, mock_torch_empty_cache, mock_torch_reset_peak_memory_stats,
|
||||
mock_torch_mem_get_info):
|
||||
"""Test determine_available_memory throws exception on memory profiling error"""
|
||||
from vllm_ascend.worker.worker import NPUWorker
|
||||
|
||||
# Setup mock: initial memory less than current free memory (error case)
|
||||
mock_platform_mem_get_info.side_effect = [
|
||||
mock_torch_mem_get_info.side_effect = [
|
||||
(8000, 10000), # 1st call
|
||||
(9000, 10000), # 2nd call: free memory increased instead
|
||||
]
|
||||
@@ -722,24 +713,22 @@ class TestNPUWorker(TestBase):
|
||||
|
||||
self.assertIn("Error in memory profiling", str(cm.exception))
|
||||
|
||||
@patch("vllm_ascend.worker.worker.NPUPlatform.clear_npu_memory")
|
||||
@patch("vllm_ascend.worker.worker.NPUPlatform.empty_cache")
|
||||
@patch("vllm_ascend.worker.worker.NPUPlatform.mem_get_info")
|
||||
@patch("torch.npu.reset_peak_memory_stats")
|
||||
@patch("torch.npu.empty_cache")
|
||||
@patch("torch_npu.npu.memory_stats")
|
||||
@patch("torch_npu.npu.mem_get_info")
|
||||
def test_determine_available_memory_negative_result(
|
||||
self,
|
||||
mock_torch_mem_get_info,
|
||||
mock_torch_memory_stats,
|
||||
mock_platform_mem_get_info,
|
||||
mock_platform_empty_cache,
|
||||
mock_platform_clear_npu_memory,
|
||||
mock_torch_empty_cache,
|
||||
mock_torch_reset_peak_memory_stats,
|
||||
):
|
||||
"""Test determine_available_memory returns 0 when result is negative"""
|
||||
from vllm_ascend.worker.worker import NPUWorker
|
||||
|
||||
# Setup mock: high peak memory causes negative available memory
|
||||
mock_platform_mem_get_info.side_effect = [
|
||||
mock_torch_mem_get_info.side_effect = [
|
||||
(8000, 10000), # 1st call
|
||||
(3000, 10000), # 2nd call
|
||||
]
|
||||
@@ -989,12 +978,10 @@ class TestNPUWorker(TestBase):
|
||||
|
||||
self.assertIn("Sleep mode can only be", str(cm.exception))
|
||||
|
||||
@patch("vllm_ascend.worker.worker.NPUPlatform.seed_everything")
|
||||
@patch("vllm_ascend.worker.worker.logger")
|
||||
@patch("vllm_ascend.worker.worker.NPUWorker._warm_up_atb")
|
||||
def test_compile_or_warm_up_model_with_eager_mode(self, mock_warm_up_atb,
|
||||
mock_logger,
|
||||
mock_seed_everything):
|
||||
mock_logger):
|
||||
"""Test compile_or_warm_up_model method - eager mode"""
|
||||
from vllm_ascend.worker.worker import NPUWorker
|
||||
|
||||
@@ -1032,17 +1019,13 @@ class TestNPUWorker(TestBase):
|
||||
# Verify log output
|
||||
self.assertEqual(mock_logger.info.call_count, 4)
|
||||
|
||||
# Verify seed setting
|
||||
mock_seed_everything.assert_called_once_with(12345)
|
||||
|
||||
# Verify atb warm up
|
||||
mock_warm_up_atb.assert_called_once()
|
||||
|
||||
@patch("vllm_ascend.worker.worker.NPUPlatform.seed_everything")
|
||||
@patch("vllm_ascend.worker.worker.logger")
|
||||
@patch("vllm_ascend.worker.worker.NPUWorker._warm_up_atb")
|
||||
def test_compile_or_warm_up_model_with_graph_capture(
|
||||
self, mock_warm_up_atb, mock_logger, mock_seed_everything):
|
||||
self, mock_warm_up_atb, mock_logger):
|
||||
"""Test compile_or_warm_up_model method - with graph capture enabled"""
|
||||
from vllm_ascend.worker.worker import NPUWorker
|
||||
|
||||
@@ -1072,9 +1055,6 @@ class TestNPUWorker(TestBase):
|
||||
# Should call capture_model in non-eager mode
|
||||
worker.model_runner.capture_model.assert_called_once()
|
||||
|
||||
# Verify seed setting
|
||||
mock_seed_everything.assert_called_once_with(67890)
|
||||
|
||||
# Verify atb warm up
|
||||
mock_warm_up_atb.assert_called_once()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user