[1/N][Refactor] torchair model runner refactor (#2205)
There is lot of torchair code in model runner leading the code hard for
maintenance. We'll create new torchair_model_runner to split torchair
related logic. Following the workflow #2203, this is the first PR.
What this PR does:
create the new torchair model runner, more function will be added later
- vLLM version: v0.10.0
- vLLM main:
586f286789
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
17
.github/workflows/vllm_ascend_test.yaml
vendored
17
.github/workflows/vllm_ascend_test.yaml
vendored
@@ -196,6 +196,13 @@ jobs:
|
||||
pytest -sv tests/e2e/singlecard/test_guided_decoding.py
|
||||
pytest -sv tests/e2e/singlecard/test_camem.py
|
||||
pytest -sv tests/e2e/singlecard/test_embedding.py
|
||||
|
||||
# ------------------------------------ v1 spec decode test ------------------------------------ #
|
||||
pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
|
||||
# TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
|
||||
pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
|
||||
|
||||
# All other tests, ignore: 310p test, accuracy test.
|
||||
pytest -sv tests/e2e/singlecard/ \
|
||||
--ignore=tests/e2e/singlecard/test_offline_inference.py \
|
||||
--ignore=tests/e2e/singlecard/test_ilama_lora.py \
|
||||
@@ -204,13 +211,9 @@ jobs:
|
||||
--ignore=tests/e2e/singlecard/test_embedding.py \
|
||||
--ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py \
|
||||
--ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py \
|
||||
--ignore=tests/e2e/singlecard/test_offline_inference_310p.py
|
||||
# ------------------------------------ v1 spec decode test ------------------------------------ #
|
||||
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
|
||||
# TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
|
||||
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
|
||||
|
||||
e2e-4-cards:
|
||||
--ignore=tests/e2e/singlecard/test_offline_inference_310p.py \
|
||||
--ignore=tests/e2e/singlecard/models/test_lm_eval_correctness.py
|
||||
e2e-2-cards:
|
||||
needs: [e2e]
|
||||
if: ${{ needs.e2e.result == 'success' }}
|
||||
strategy:
|
||||
|
||||
29
vllm_ascend/torchair/torchair_model_runner.py
Normal file
29
vllm_ascend/torchair/torchair_model_runner.py
Normal file
@@ -0,0 +1,29 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
# Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py
|
||||
#
|
||||
|
||||
import torch
|
||||
from vllm.config import VllmConfig
|
||||
|
||||
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
||||
|
||||
|
||||
class NPUTorchairModelRunner(NPUModelRunner):
|
||||
|
||||
def __init__(self, vllm_config: VllmConfig, device: torch.device):
|
||||
super().__init__(vllm_config, device)
|
||||
@@ -17,6 +17,7 @@ import torch
|
||||
from vllm.logger import logger
|
||||
|
||||
import vllm_ascend.envs as envs_ascend
|
||||
from vllm_ascend.torchair.torchair_model_runner import NPUTorchairModelRunner
|
||||
from vllm_ascend.torchair.utils import (check_kv_cache_bytes_cache_exist,
|
||||
check_torchair_cache_exist,
|
||||
delete_torchair_cache_file,
|
||||
@@ -52,3 +53,9 @@ class NPUTorchairWorker(NPUWorker):
|
||||
self.model_runner.new_kv_cache_bytes = available_kv_cache_memory
|
||||
|
||||
return available_kv_cache_memory
|
||||
|
||||
def init_device(self):
|
||||
"""Override init_device to init torchair model runner"""
|
||||
device = self._init_device()
|
||||
# Init ModelRunner here, so that we have access to self.device.
|
||||
self.model_runner = NPUTorchairModelRunner(self.vllm_config, device)
|
||||
|
||||
@@ -130,17 +130,19 @@ class NPUWorker(WorkerBase):
|
||||
self.cache_config.num_gpu_blocks = num_gpu_blocks
|
||||
self.cache_config.num_cpu_blocks = num_cpu_blocks
|
||||
|
||||
def init_device(self):
|
||||
def _init_device(self):
|
||||
device = torch.device(f"npu:{self.local_rank}")
|
||||
NPUPlatform.set_device(device)
|
||||
NPUPlatform.empty_cache()
|
||||
self.init_npu_memory = NPUPlatform.mem_get_info()[0]
|
||||
|
||||
# Initialize the distributed environment.
|
||||
self._init_worker_distributed_environment()
|
||||
# Set random seed.
|
||||
NPUPlatform.seed_everything(self.model_config.seed)
|
||||
return device
|
||||
|
||||
def init_device(self):
|
||||
device = self._init_device()
|
||||
# Init ModelRunner here, so that we have access to self.device.
|
||||
self.model_runner = NPUModelRunner(self.vllm_config, device)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user