提交vllm0.11.0开发分支
This commit is contained in:
0
vllm_kunlun/worker/__init__.py
Normal file
0
vllm_kunlun/worker/__init__.py
Normal file
2043
vllm_kunlun/worker/model_runner.py
Normal file
2043
vllm_kunlun/worker/model_runner.py
Normal file
File diff suppressed because it is too large
Load Diff
50
vllm_kunlun/worker/worker.py
Normal file
50
vllm_kunlun/worker/worker.py
Normal file
@@ -0,0 +1,50 @@
|
||||
"""worker"""
|
||||
from typing import Dict, List, Optional, Set, Tuple, Type, Union
|
||||
from vllm.v1.worker.gpu_worker import Worker, _check_if_gpu_supports_dtype, init_worker_distributed_environment
|
||||
from vllm.model_executor import set_random_seed
|
||||
from .model_runner import KunlunModelRunner
|
||||
from vllm.utils import MemorySnapshot
|
||||
import torch
|
||||
import os
|
||||
import gc
|
||||
|
||||
class KunlunWorker(Worker):
    """vLLM v1 worker that builds a ``KunlunModelRunner``.

    Only ``init_device`` is overridden here; all other behaviour is
    inherited from ``vllm.v1.worker.gpu_worker.Worker``.
    """

    def init_device(self) -> None:
        """Bind the worker to its device and construct the model runner.

        The method, in order:
          1. rejects any device type other than ``"cuda"``;
          2. adjusts NCCL-related environment variables;
          3. selects ``cuda:{local_rank}`` as the current device and checks
             that it supports the model dtype;
          4. clears cached memory and records the initial memory state
             (``init_snapshot``, ``init_gpu_memory``, ``requested_memory``);
          5. initializes the distributed environment, seeds the RNGs and
             creates ``self.model_runner``.

        Raises:
            RuntimeError: if ``self.device_config.device.type`` is not
                ``"cuda"``.
        """
        if self.device_config.device.type != "cuda":
            raise RuntimeError(
                f"Not support device type: {self.device_config.device}")

        # torch.distributed.all_reduce does not free the input tensor until
        # the synchronization point. This causes the memory usage to grow
        # as the number of all_reduce calls increases. This env var disables
        # this behavior.
        # Related issue:
        # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
        os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"

        # This env var set by Ray causes exceptions with graph building.
        os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)

        self.device = torch.device(f"cuda:{self.local_rank}")
        torch.cuda.set_device(self.device)

        _check_if_gpu_supports_dtype(self.model_config.dtype)

        # Start from a clean slate so the numbers recorded below reflect the
        # device state before any model memory is allocated.
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        self.init_snapshot = MemorySnapshot()

        free_memory, total = torch.cuda.mem_get_info()
        self.init_gpu_memory = free_memory

        # Memory budget for this worker. The previous hard-coded
        # ``total * 0.2`` contradicted its own comment (which asked for ~80%
        # of total memory, leaving 20% headroom) and ignored the configured
        # utilization. Use the configured fraction, as the upstream GPU
        # worker does.
        self.requested_memory = int(
            total * self.cache_config.gpu_memory_utilization)

        # Initialize the distributed environment.
        init_worker_distributed_environment(
            self.vllm_config,
            self.rank,
            self.distributed_init_method,
            self.local_rank,
        )

        # Set random seed.
        set_random_seed(self.model_config.seed)

        # Construct the model runner.
        self.model_runner: KunlunModelRunner = KunlunModelRunner(
            self.vllm_config, self.device)
|
||||
Reference in New Issue
Block a user