51 lines
2.3 KiB
Python
51 lines
2.3 KiB
Python
"""Kunlun worker built on top of vLLM's v1 GPU ``Worker``."""
|
|
from typing import Dict, List, Optional, Set, Tuple, Type, Union
|
|
from vllm.v1.worker.gpu_worker import Worker, _check_if_gpu_supports_dtype, init_worker_distributed_environment
|
|
from vllm.model_executor import set_random_seed
|
|
from .model_runner import KunlunModelRunner
|
|
from vllm.utils import MemorySnapshot
|
|
import torch
|
|
import os
|
|
import gc
|
|
|
|
class KunlunWorker(Worker):
    """Worker subclass that prepares the device and a ``KunlunModelRunner``.

    Only ``init_device`` is overridden here; all other behaviour is inherited
    from ``Worker``.
    """

    # Fraction of the device's total memory requested at start-up. The
    # remaining 20% is deliberately left as headroom.
    # NOTE(review): this is a fixed fraction and does not consult any
    # user-configured memory-utilization setting -- confirm that is intended.
    _REQUESTED_MEMORY_FRACTION = 0.8

    def init_device(self):
        """Bind this worker to its device and construct the model runner.

        Steps, in order:
          1. Reject any device type other than ``"cuda"``.
          2. Adjust NCCL-related environment variables.
          3. Select ``cuda:<local_rank>`` as the current device and verify it
             supports the configured model dtype.
          4. Clear cached allocations, reset peak-memory statistics, and
             record the initial memory state (``init_snapshot``,
             ``init_gpu_memory``, ``requested_memory``).
          5. Initialize the distributed environment, seed the RNGs, and
             create ``self.model_runner``.

        Raises:
            RuntimeError: If ``self.device_config.device.type`` is not
                ``"cuda"``.
        """
        if self.device_config.device.type != "cuda":
            raise RuntimeError(
                f"Not support device type: {self.device_config.device}")

        # torch.distributed.all_reduce does not free the input tensor until
        # the synchronization point. This causes the memory usage to grow
        # as the number of all_reduce calls increases. This env var disables
        # this behavior.
        # Related issue:
        # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
        os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"

        # This env var set by Ray causes exceptions with graph building.
        os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)

        self.device = torch.device(f"cuda:{self.local_rank}")
        torch.cuda.set_device(self.device)

        _check_if_gpu_supports_dtype(self.model_config.dtype)

        # Start from a clean allocator state so the figures recorded below
        # are not skewed by garbage or cached blocks left over from earlier.
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        self.init_snapshot = MemorySnapshot()

        free_memory, total_memory = torch.cuda.mem_get_info()
        self.init_gpu_memory = free_memory
        # Request 80% of total device memory, leaving 20% as headroom.
        # (The previous code multiplied by 0.2, i.e. requested only the
        # share that was meant to be the margin.)
        self.requested_memory = int(
            total_memory * self._REQUESTED_MEMORY_FRACTION)

        # Initialize the distributed environment.
        init_worker_distributed_environment(self.vllm_config,
                                            self.rank,
                                            self.distributed_init_method,
                                            self.local_rank)

        # Set random seed.
        set_random_seed(self.model_config.seed)

        # Construct the model runner.
        self.model_runner: KunlunModelRunner = KunlunModelRunner(
            self.vllm_config, self.device)