Files
xc-llm-kunlun/vllm_kunlun/worker/worker.py
2025-12-10 17:51:24 +08:00

51 lines
2.3 KiB
Python

"""worker"""
from typing import Dict, List, Optional, Set, Tuple, Type, Union
from vllm.v1.worker.gpu_worker import Worker, _check_if_gpu_supports_dtype, init_worker_distributed_environment
from vllm.model_executor import set_random_seed
from .model_runner import KunlunModelRunner
from vllm.utils import MemorySnapshot
import torch
import os
import gc
class KunlunWorker(Worker):
    """vLLM v1 GPU worker specialized for Kunlun devices.

    Overrides ``init_device`` to perform CUDA device setup, capture a
    baseline memory snapshot, initialize the distributed environment,
    and construct a :class:`KunlunModelRunner` instead of the stock
    model runner.
    """

    def init_device(self) -> None:
        """Bind this worker to its CUDA device and build the model runner.

        Steps: set NCCL-related env vars, select ``cuda:<local_rank>``,
        validate the model dtype, record a clean-state memory baseline,
        bring up the distributed environment, seed RNGs, and create the
        Kunlun model runner.

        Raises:
            RuntimeError: if the configured device type is not ``cuda``.
        """
        if self.device_config.device.type == "cuda":
            # torch.distributed.all_reduce does not free the input tensor until
            # the synchronization point. This causes the memory usage to grow
            # as the number of all_reduce calls increases. This env var disables
            # this behavior.
            # Related issue:
            # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
            os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
            # This env var set by Ray causes exceptions with graph building.
            os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
            self.device = torch.device(f"cuda:{self.local_rank}")
            torch.cuda.set_device(self.device)
            _check_if_gpu_supports_dtype(self.model_config.dtype)
            # Drop Python garbage and cached CUDA allocations, and reset the
            # peak counters, so the snapshot and mem_get_info below reflect a
            # clean baseline.
            gc.collect()
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()
            self.init_snapshot = MemorySnapshot()
            free_memory, total = torch.cuda.mem_get_info()
            self.init_gpu_memory = free_memory
            # NOTE(review): the original (Chinese) comments contradicted each
            # other — one said "set a reasonable initial value, e.g. 80% of
            # total memory" while the code computes total * 0.2 ("leave a 20%
            # margin"). Confirm which fraction is intended before relying on
            # self.requested_memory.
            self.requested_memory = int(total * 0.2)
        else:
            raise RuntimeError(
                f"Not support device type: {self.device_config.device}")
        # Initialize the distributed environment.
        init_worker_distributed_environment(self.vllm_config,
                                            self.rank,
                                            self.distributed_init_method,
                                            self.local_rank)
        # Set random seed.
        set_random_seed(self.model_config.seed)
        # Construct the model runner
        self.model_runner: KunlunModelRunner = KunlunModelRunner(
            self.vllm_config, self.device)