[1/N][Refactor] Refactor code to adapt with vllm main (#3612)
### What this PR does / why we need it? This is the step 1 of refactoring code to adapt with vllm main, and this pr aligned with17c540a9931. refactor deepseek to the latest code arch as of17c540a9932. bunches of fixes due to vllm changes - Fix `AscendScheduler` `__post_init__`, caused by https://github.com/vllm-project/vllm/pull/25075 - Fix `AscendScheduler` init got an unexpected arg `block_size`, caused by https://github.com/vllm-project/vllm/pull/26296 - Fix `KVCacheManager` `get_num_common_prefix_blocks` arg, caused by https://github.com/vllm-project/vllm/pull/23485 - Fix `MLAAttention` import,caused by https://github.com/vllm-project/vllm/pull/25103 - Fix `SharedFusedMoE` import, caused by https://github.com/vllm-project/vllm/pull/26145 - Fix `LazyLoader` improt, caused by https://github.com/vllm-project/vllm/pull/27022 - Fix `vllm.utils.swap_dict_values` improt, caused by https://github.com/vllm-project/vllm/pull/26990 - Fix `Backend` enum import, caused by https://github.com/vllm-project/vllm/pull/25893 - Fix `CompilationLevel` renaming to `CompilationMode` issue introduced by https://github.com/vllm-project/vllm/pull/26355 - Fix fused_moe ops, caused by https://github.com/vllm-project/vllm/pull/24097 - Fix bert model because of `inputs_embeds`, caused by https://github.com/vllm-project/vllm/pull/25922 - Fix MRope because of `get_input_positions_tensor` to `get_mrope_input_positions`, caused by https://github.com/vllm-project/vllm/pull/24172 - Fix `splitting_ops` changes introduced by https://github.com/vllm-project/vllm/pull/25845 - Fix multi-modality changes introduced by https://github.com/vllm-project/vllm/issues/16229 - Fix lora bias dropping issue introduced by https://github.com/vllm-project/vllm/pull/25807 - Fix structured ouput break introduced by https://github.com/vllm-project/vllm/issues/26737 ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? CI passed with existing test. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: Icey <1790571317@qq.com> Co-authored-by: Icey <1790571317@qq.com>
This commit is contained in:
@@ -1,19 +1,19 @@
|
||||
import importlib
|
||||
import unittest
|
||||
from datetime import timedelta
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from torch.distributed import ProcessGroup
|
||||
from torch.distributed.distributed_c10d import PrefixStore
|
||||
from vllm.config import CompilationLevel
|
||||
from vllm.config.compilation import CUDAGraphMode
|
||||
from vllm.platforms import PlatformEnum
|
||||
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend.platform import NPUPlatform
|
||||
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD
|
||||
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
from vllm.config.compilation import CompilationLevel
|
||||
else:
|
||||
from vllm.config.compilation import CompilationMode
|
||||
|
||||
|
||||
class TestNPUPlatform(TestBase):
|
||||
@@ -249,6 +249,7 @@ class TestNPUPlatform(TestBase):
|
||||
vllm_config.parallel_config.enable_expert_parallel = False
|
||||
vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
mock_init_recompute.return_value = MagicMock()
|
||||
vllm_config.scheduler_config = MagicMock()
|
||||
|
||||
# Use importlib.reload to reload the platform module, ensuring the mocked init_ascend_config method is used.
|
||||
# Without this reload, when calling self.platform.check_and_update_config,
|
||||
@@ -277,6 +278,7 @@ class TestNPUPlatform(TestBase):
|
||||
vllm_config.model_config = None
|
||||
vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
mock_init_recompute.return_value = MagicMock()
|
||||
vllm_config.scheduler_config = MagicMock()
|
||||
|
||||
with self.assertLogs(logger="vllm", level="WARNING") as cm:
|
||||
from vllm_ascend import platform
|
||||
@@ -300,6 +302,7 @@ class TestNPUPlatform(TestBase):
|
||||
vllm_config.model_config.enforce_eager = True
|
||||
vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
mock_init_recompute.return_value = MagicMock()
|
||||
vllm_config.scheduler_config = MagicMock()
|
||||
|
||||
with self.assertLogs(logger="vllm", level="INFO") as cm:
|
||||
from vllm_ascend import platform
|
||||
@@ -308,10 +311,18 @@ class TestNPUPlatform(TestBase):
|
||||
self.platform.check_and_update_config(vllm_config)
|
||||
self.assertTrue("Compilation disabled, using eager mode by default" in
|
||||
cm.output[0])
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.level,
|
||||
CompilationLevel.NO_COMPILATION,
|
||||
)
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.level,
|
||||
CompilationLevel.NO_COMPILATION,
|
||||
)
|
||||
else:
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.mode,
|
||||
CompilationMode.NONE,
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.cudagraph_mode,
|
||||
CUDAGraphMode.NONE,
|
||||
@@ -330,9 +341,14 @@ class TestNPUPlatform(TestBase):
|
||||
)
|
||||
vllm_config = TestNPUPlatform.mock_vllm_config()
|
||||
vllm_config.model_config.enforce_eager = False
|
||||
vllm_config.compilation_config.level = CompilationLevel.DYNAMO_ONCE
|
||||
vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
mock_init_recompute.return_value = MagicMock()
|
||||
vllm_config.scheduler_config = MagicMock()
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
vllm_config.compilation_config.level = CompilationLevel.DYNAMO_ONCE
|
||||
else:
|
||||
vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
|
||||
|
||||
with self.assertLogs(logger="vllm", level="WARNING") as cm:
|
||||
from vllm_ascend import platform
|
||||
@@ -340,10 +356,16 @@ class TestNPUPlatform(TestBase):
|
||||
importlib.reload(platform)
|
||||
self.platform.check_and_update_config(vllm_config)
|
||||
self.assertTrue("NPU does not support" in cm.output[0])
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.level,
|
||||
CompilationLevel.NO_COMPILATION,
|
||||
)
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.level,
|
||||
CompilationMode.NONE,
|
||||
)
|
||||
else:
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.mode,
|
||||
CompilationMode.NONE,
|
||||
)
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.cudagraph_mode,
|
||||
CUDAGraphMode.NONE,
|
||||
@@ -370,10 +392,17 @@ class TestNPUPlatform(TestBase):
|
||||
self.assertTrue(
|
||||
"cudagraph_mode is not support on NPU. falling back to NONE" in
|
||||
cm.output[0])
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.level,
|
||||
CompilationLevel.NO_COMPILATION,
|
||||
)
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.level,
|
||||
CompilationLevel.NO_COMPILATION,
|
||||
)
|
||||
else:
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.mode,
|
||||
CompilationMode.NONE,
|
||||
)
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.cudagraph_mode,
|
||||
CUDAGraphMode.NONE,
|
||||
@@ -393,9 +422,14 @@ class TestNPUPlatform(TestBase):
|
||||
mock_init_ascend.return_value = mock_ascend_config
|
||||
vllm_config = TestNPUPlatform.mock_vllm_config()
|
||||
vllm_config.model_config.enforce_eager = False
|
||||
vllm_config.compilation_config.level = CompilationLevel.PIECEWISE
|
||||
vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
mock_init_recompute.return_value = MagicMock()
|
||||
vllm_config.scheduler_config = MagicMock()
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
vllm_config.compilation_config.level = CompilationLevel.PIECEWISE
|
||||
else:
|
||||
vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
|
||||
|
||||
with self.assertLogs(logger="vllm", level="INFO") as cm:
|
||||
from vllm_ascend import platform
|
||||
@@ -403,10 +437,17 @@ class TestNPUPlatform(TestBase):
|
||||
importlib.reload(platform)
|
||||
self.platform.check_and_update_config(vllm_config)
|
||||
self.assertTrue("Torchair compilation enabled" in cm.output[0])
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.level,
|
||||
CompilationLevel.NO_COMPILATION,
|
||||
)
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.level,
|
||||
CompilationLevel.NO_COMPILATION,
|
||||
)
|
||||
else:
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.mode,
|
||||
CompilationMode.NONE,
|
||||
)
|
||||
self.assertEqual(
|
||||
vllm_config.compilation_config.cudagraph_mode,
|
||||
CUDAGraphMode.NONE,
|
||||
@@ -428,6 +469,7 @@ class TestNPUPlatform(TestBase):
|
||||
vllm_config.cache_config.enable_prefix_caching = True
|
||||
vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
mock_init_recompute.return_value = MagicMock()
|
||||
vllm_config.scheduler_config = MagicMock()
|
||||
|
||||
from vllm_ascend import platform
|
||||
|
||||
@@ -452,6 +494,7 @@ class TestNPUPlatform(TestBase):
|
||||
vllm_config.parallel_config.worker_cls = "auto"
|
||||
vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
mock_init_recompute.return_value = MagicMock()
|
||||
vllm_config.scheduler_config = MagicMock()
|
||||
|
||||
from vllm_ascend import platform
|
||||
|
||||
@@ -489,6 +532,7 @@ class TestNPUPlatform(TestBase):
|
||||
vllm_config.parallel_config.tensor_parallel_size = 1
|
||||
mock_init_recompute.return_value = MagicMock()
|
||||
|
||||
vllm_config.scheduler_config = MagicMock()
|
||||
from vllm_ascend import platform
|
||||
|
||||
importlib.reload(platform)
|
||||
@@ -609,8 +653,12 @@ class TestNPUPlatform(TestBase):
|
||||
|
||||
def test_get_punica_wrapper(self):
|
||||
result = self.platform.get_punica_wrapper()
|
||||
self.assertEqual(result,
|
||||
"vllm_ascend.lora.punica_npu.PunicaWrapperNPU")
|
||||
if vllm_version_is("0.11.0"):
|
||||
self.assertEqual(
|
||||
result, "vllm_ascend.lora.punica_npu.PunicaWrapperNPU0110")
|
||||
else:
|
||||
self.assertEqual(result,
|
||||
"vllm_ascend.lora.punica_npu.PunicaWrapperNPU")
|
||||
|
||||
@patch("torch.npu.reset_peak_memory_stats")
|
||||
@patch("torch.npu.max_memory_allocated")
|
||||
@@ -674,54 +722,3 @@ class TestNPUPlatform(TestBase):
|
||||
self.platform.get_static_graph_wrapper_cls(),
|
||||
"vllm_ascend.compilation.acl_graph.ACLGraphWrapper",
|
||||
)
|
||||
|
||||
@patch("torch.distributed.is_hccl_available", return_value=True)
|
||||
@patch("torch_npu._C._distributed_c10d.ProcessGroupHCCL")
|
||||
@patch("torch.distributed.ProcessGroup")
|
||||
def test_successful_initialization(self, mock_pg, mock_pg_hccl, _):
|
||||
mock_prefix = MagicMock(spec=PrefixStore)
|
||||
mock_backend = MagicMock()
|
||||
mock_pg_hccl.return_value = mock_backend
|
||||
group_rank = 0
|
||||
group_size = 4
|
||||
|
||||
mock_pg_instance = MagicMock(spec=ProcessGroup)
|
||||
mock_pg.return_value = mock_pg_instance
|
||||
|
||||
# Use importlib.reload() to force-reload the platform module and ensure the mocked ProcessGroup is used.
|
||||
# Without this reload, when executing self.platform.stateless_init_device_torch_dist_pg(),
|
||||
# it would invoke the original unmocked ProcessGroup implementation instead of our test mock,
|
||||
# which would cause the unit test to fail.
|
||||
from vllm_ascend import platform
|
||||
|
||||
importlib.reload(platform)
|
||||
|
||||
result = self.platform.stateless_init_device_torch_dist_pg(
|
||||
backend="hccl",
|
||||
prefix_store=mock_prefix,
|
||||
group_rank=group_rank,
|
||||
group_size=group_size,
|
||||
timeout=timedelta(seconds=30),
|
||||
)
|
||||
|
||||
mock_pg.assert_called_once_with(mock_prefix, group_rank, group_size)
|
||||
mock_pg_hccl.assert_called_once_with(mock_prefix, group_rank,
|
||||
group_size, unittest.mock.ANY)
|
||||
mock_backend._set_sequence_number_for_group.assert_called_once()
|
||||
mock_pg_instance._register_backend.assert_called_once_with(
|
||||
torch.device("npu"), unittest.mock.ANY, mock_backend)
|
||||
self.assertEqual(result, mock_pg_instance)
|
||||
|
||||
@patch("torch.distributed.is_hccl_available", return_value=False)
|
||||
def test_hccl_unavailable(self, _):
|
||||
with self.assertRaises(AssertionError):
|
||||
from vllm_ascend import platform
|
||||
|
||||
importlib.reload(platform)
|
||||
self.platform.stateless_init_device_torch_dist_pg(
|
||||
backend="hccl",
|
||||
prefix_store=MagicMock(),
|
||||
group_rank=0,
|
||||
group_size=4,
|
||||
timeout=timedelta(seconds=30),
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user