[Feature] model_runner refactor (#4764)

### What this PR does / why we need it?
Refactors the NPU model runner so that its structure stays close to the GPU model runner implementation.

### Does this PR introduce _any_ user-facing change?
NO

- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

---------

Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Signed-off-by: zhenwenqi2024 <155598497+zhenwenqi2024@users.noreply.github.com>
This commit is contained in:
zhenwenqi2024
2025-12-12 17:27:09 +08:00
committed by GitHub
parent 5b12c068f9
commit f708d919f8
10 changed files with 676 additions and 1815 deletions

View File

@@ -24,6 +24,7 @@ from vllm.utils.torch_utils import make_tensor_with_pad
from vllm.v1.pool.metadata import PoolingMetadata
from vllm.v1.sample.logits_processor import LogitsProcessors
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.utils import CpuGpuBuffer
from vllm_ascend.worker.block_table import BlockTable, MultiGroupBlockTable
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
@@ -67,6 +68,8 @@ def _compare_objs(obj1,
is_same = True # if we make it here must be same
elif a == b:
is_same = True
elif isinstance(a, CpuGpuBuffer):
is_same = np.allclose(a.np, b.np) and torch.allclose(a.gpu, b.gpu)
assert is_same, f"Attribute {attr_name} is different"\
f" in {obj1} and {obj2}: {a} != {b}"

View File

@@ -1,113 +0,0 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
from unittest.mock import MagicMock, patch
import pytest
from vllm_ascend.ascend_forward_context import MoECommType
from vllm_ascend.utils import AscendDeviceType
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
# yapf: disable
@pytest.mark.parametrize(
    "soc_version, enable_expert_parallel, world_size, pipeline_size, num_tokens, mc2_tokens_capacity, quant_type, expected_method",
    [
        # Expert parallel disabled -> always ALLGATHER, regardless of SOC.
        (AscendDeviceType._910B, False, 8, 2, 100, 256, None, MoECommType.ALLGATHER),
        (AscendDeviceType._910_93, False, 16, 2, 500, 256, None, MoECommType.ALLGATHER),
        # A2 SOC with w4a8_dynamic quantization: ALLTOALL unless the MC2
        # preconditions are met.
        (AscendDeviceType._910B, True, 8, 1, 100, 256, "w4a8_dynamic", MoECommType.ALLTOALL),
        (AscendDeviceType._910B, True, 16, 1, 257, 256, "w4a8_dynamic", MoECommType.ALLTOALL),
        (AscendDeviceType._910B, True, 16, 1, 100, 256, "w4a8_dynamic", MoECommType.MC2),  # meets mc2 condition
        # A2 SOC without w4a8_dynamic -> falls back to ALLGATHER.
        (AscendDeviceType._910B, True, 8, 2, 100, 256, None, MoECommType.ALLGATHER),
        (AscendDeviceType._910B, True, 16, 2, 257, 256, None, MoECommType.ALLGATHER),
        # A3 SOC: MC2 within token capacity, ALLTOALL beyond it.
        (AscendDeviceType._910_93, True, 8, 2, 100, 256, None, MoECommType.MC2),
        (AscendDeviceType._910_93, True, 8, 2, 257, 256, None, MoECommType.ALLTOALL),
    ])
# yapf: enable
def test_select_moe_comm_method(soc_version, enable_expert_parallel,
                                world_size, pipeline_size, num_tokens,
                                mc2_tokens_capacity, quant_type,
                                expected_method):
    """Exercise _select_moe_comm_method across SOC versions, parallelism
    settings and quantization types, checking the chosen comm backend."""
    # A spec'd mock stands in for a real runner; only the attributes the
    # method under test reads are populated.
    runner = MagicMock(spec=NPUModelRunner)
    runner.parallel_config = MagicMock()
    runner.parallel_config.enable_expert_parallel = enable_expert_parallel
    runner.parallel_config.world_size_across_dp = world_size
    runner.parallel_config.pipeline_parallel_size = pipeline_size
    runner.mc2_tokens_capacity = mc2_tokens_capacity

    # The method inspects vllm_config.model_config.hf_config.moe_quantize;
    # an unspecced MagicMock auto-creates the intermediate attributes.
    vllm_config = MagicMock()
    vllm_config.model_config.hf_config.moe_quantize = quant_type
    runner.vllm_config = vllm_config

    with patch('vllm_ascend.worker.model_runner_v1.get_ascend_device_type',
               return_value=soc_version), \
         patch('vllm_ascend.worker.model_runner_v1.is_global_first_rank',
               return_value=True), \
         patch('vllm_ascend.worker.model_runner_v1.is_moe_model',
               return_value=True):
        # Call the unbound method, with the mock standing in for `self`.
        chosen = NPUModelRunner._select_moe_comm_method(runner, num_tokens)

    assert chosen == expected_method
def test_select_moe_comm_method_unsupported_soc():
    """An unrecognized SOC version must make _select_moe_comm_method raise
    ValueError."""
    runner = MagicMock(spec=NPUModelRunner)
    runner.parallel_config = MagicMock()
    runner.parallel_config.enable_expert_parallel = True
    runner.mc2_tokens_capacity = 256

    # Only the quantization lookup path needs concrete values on the
    # config mock; MagicMock auto-creates the intermediate attributes.
    vllm_config = MagicMock()
    vllm_config.model_config.hf_config.moe_quantize = None
    runner.vllm_config = vllm_config

    bogus_soc = "UnsupportedSOC"
    with patch('vllm_ascend.worker.model_runner_v1.get_ascend_device_type',
               return_value=bogus_soc), \
         patch('vllm_ascend.worker.model_runner_v1.is_global_first_rank',
               return_value=True), \
         patch('vllm_ascend.worker.model_runner_v1.is_moe_model',
               return_value=True), \
         pytest.raises(ValueError,
                       match=f"Unsupported soc_version: {bogus_soc}"):
        NPUModelRunner._select_moe_comm_method(runner, 100)