[1/N][Refactor] torchair model runner refactor (#2205)

There is a lot of torchair code in the model runner, which makes the code hard to maintain. We'll create a new torchair_model_runner to split out the torchair-related logic. Following the workflow in #2203, this is the first PR. What this PR does: create the new torchair model runner; more functions will be added later. - vLLM version: v0.10.0 - vLLM main: 586f286789 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-05 18:43:04 +08:00
parent 458ab2db12
commit 292fb8f696
4 changed files with 50 additions and 9 deletions
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -196,6 +196,13 @@ jobs:
          pytest -sv tests/e2e/singlecard/test_guided_decoding.py
          pytest -sv tests/e2e/singlecard/test_camem.py
          pytest -sv tests/e2e/singlecard/test_embedding.py
+
+          # ------------------------------------ v1 spec decode test ------------------------------------ #
+          pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
+          # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
+          pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
+
+          # All other tests, ignore: 310p test, accuracy test.
          pytest -sv tests/e2e/singlecard/ \
          --ignore=tests/e2e/singlecard/test_offline_inference.py \
          --ignore=tests/e2e/singlecard/test_ilama_lora.py \
@@ -204,13 +211,9 @@ jobs:
          --ignore=tests/e2e/singlecard/test_embedding.py \
          --ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py \
          --ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py \
-          --ignore=tests/e2e/singlecard/test_offline_inference_310p.py
-          # ------------------------------------ v1 spec decode test ------------------------------------ #
-          VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
-          # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
-          VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
-
-  e2e-4-cards:
+          --ignore=tests/e2e/singlecard/test_offline_inference_310p.py \
+          --ignore=tests/e2e/singlecard/models/test_lm_eval_correctness.py
+  e2e-2-cards:
    needs: [e2e]
    if: ${{ needs.e2e.result == 'success' }}
    strategy:
--- a/vllm_ascend/torchair/torchair_model_runner.py
+++ b/vllm_ascend/torchair/torchair_model_runner.py
@@ -0,0 +1,29 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py
+#
+
+import torch
+from vllm.config import VllmConfig
+
+from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
+
+
+class NPUTorchairModelRunner(NPUModelRunner):
+
+    def __init__(self, vllm_config: VllmConfig, device: torch.device):
+        super().__init__(vllm_config, device)
--- a/vllm_ascend/torchair/torchair_worker.py
+++ b/vllm_ascend/torchair/torchair_worker.py
@@ -17,6 +17,7 @@ import torch
 from vllm.logger import logger

 import vllm_ascend.envs as envs_ascend
+from vllm_ascend.torchair.torchair_model_runner import NPUTorchairModelRunner
 from vllm_ascend.torchair.utils import (check_kv_cache_bytes_cache_exist,
                                        check_torchair_cache_exist,
                                        delete_torchair_cache_file,
@@ -52,3 +53,9 @@ class NPUTorchairWorker(NPUWorker):
        self.model_runner.new_kv_cache_bytes = available_kv_cache_memory

        return available_kv_cache_memory
+
+    def init_device(self):
+        """Override init_device to init torchair model runner"""
+        device = self._init_device()
+        # Init ModelRunner here, so that we have access to self.device.
+        self.model_runner = NPUTorchairModelRunner(self.vllm_config, device)
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -130,17 +130,19 @@ class NPUWorker(WorkerBase):
        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks

-    def init_device(self):
+    def _init_device(self):
        device = torch.device(f"npu:{self.local_rank}")
        NPUPlatform.set_device(device)
        NPUPlatform.empty_cache()
        self.init_npu_memory = NPUPlatform.mem_get_info()[0]
-
        # Initialize the distributed environment.
        self._init_worker_distributed_environment()
        # Set random seed.
        NPUPlatform.seed_everything(self.model_config.seed)
+        return device

+    def init_device(self):
+        device = self._init_device()
        # Init ModelRunner here, so that we have access to self.device.
        self.model_runner = NPUModelRunner(self.vllm_config, device)