From 292fb8f69601507304b8c1c0a90487e73b0bc164 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Tue, 5 Aug 2025 18:43:04 +0800 Subject: [PATCH] [1/N][Refactor] torchair model runner refactor (#2205) There is a lot of torchair code in the model runner, making the code hard to maintain. We'll create a new torchair_model_runner to split out torchair-related logic. Following the workflow in #2203, this is the first PR. What this PR does: create the new torchair model runner; more functions will be added later - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/586f286789a09f5616be74ee8bedde0a9f698a72 Signed-off-by: wangxiyuan --- .github/workflows/vllm_ascend_test.yaml | 17 ++++++----- vllm_ascend/torchair/torchair_model_runner.py | 29 +++++++++++++++++++ vllm_ascend/torchair/torchair_worker.py | 7 +++++ vllm_ascend/worker/worker_v1.py | 6 ++-- 4 files changed, 50 insertions(+), 9 deletions(-) create mode 100644 vllm_ascend/torchair/torchair_model_runner.py diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 32363ff..bdb0c96 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -196,6 +196,13 @@ jobs: pytest -sv tests/e2e/singlecard/test_guided_decoding.py pytest -sv tests/e2e/singlecard/test_camem.py pytest -sv tests/e2e/singlecard/test_embedding.py + + # ------------------------------------ v1 spec decode test ------------------------------------ # + pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py + # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed + pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py + + # All other tests, ignore: 310p test, accuracy test. 
pytest -sv tests/e2e/singlecard/ \ --ignore=tests/e2e/singlecard/test_offline_inference.py \ --ignore=tests/e2e/singlecard/test_ilama_lora.py \ @@ -204,13 +211,9 @@ jobs: --ignore=tests/e2e/singlecard/test_embedding.py \ --ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py \ --ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py \ - --ignore=tests/e2e/singlecard/test_offline_inference_310p.py - # ------------------------------------ v1 spec decode test ------------------------------------ # - VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py - # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed - VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py - - e2e-4-cards: + --ignore=tests/e2e/singlecard/test_offline_inference_310p.py \ + --ignore=tests/e2e/singlecard/models/test_lm_eval_correctness.py + e2e-2-cards: needs: [e2e] if: ${{ needs.e2e.result == 'success' }} strategy: diff --git a/vllm_ascend/torchair/torchair_model_runner.py b/vllm_ascend/torchair/torchair_model_runner.py new file mode 100644 index 0000000..2001674 --- /dev/null +++ b/vllm_ascend/torchair/torchair_model_runner.py @@ -0,0 +1,29 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py +# + +import torch +from vllm.config import VllmConfig + +from vllm_ascend.worker.model_runner_v1 import NPUModelRunner + + +class NPUTorchairModelRunner(NPUModelRunner): + + def __init__(self, vllm_config: VllmConfig, device: torch.device): + super().__init__(vllm_config, device) diff --git a/vllm_ascend/torchair/torchair_worker.py b/vllm_ascend/torchair/torchair_worker.py index f74bc02..3488ac7 100644 --- a/vllm_ascend/torchair/torchair_worker.py +++ b/vllm_ascend/torchair/torchair_worker.py @@ -17,6 +17,7 @@ import torch from vllm.logger import logger import vllm_ascend.envs as envs_ascend +from vllm_ascend.torchair.torchair_model_runner import NPUTorchairModelRunner from vllm_ascend.torchair.utils import (check_kv_cache_bytes_cache_exist, check_torchair_cache_exist, delete_torchair_cache_file, @@ -52,3 +53,9 @@ class NPUTorchairWorker(NPUWorker): self.model_runner.new_kv_cache_bytes = available_kv_cache_memory return available_kv_cache_memory + + def init_device(self): + """Override init_device to init torchair model runner""" + device = self._init_device() + # Init ModelRunner here, so that we have access to self.device. + self.model_runner = NPUTorchairModelRunner(self.vllm_config, device) diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index c13238a..d9bff3c 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -130,17 +130,19 @@ class NPUWorker(WorkerBase): self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks - def init_device(self): + def _init_device(self): device = torch.device(f"npu:{self.local_rank}") NPUPlatform.set_device(device) NPUPlatform.empty_cache() self.init_npu_memory = NPUPlatform.mem_get_info()[0] - # Initialize the distributed environment. self._init_worker_distributed_environment() # Set random seed. 
NPUPlatform.seed_everything(self.model_config.seed) + return device + def init_device(self): + device = self._init_device() # Init ModelRunner here, so that we have access to self.device. self.model_runner = NPUModelRunner(self.vllm_config, device)