# xc-llm-kunlun/vllm_kunlun/worker/worker.py

"""Kunlun worker: a vLLM v1 ``Worker`` subclass that builds a ``KunlunModelRunner``."""
from typing import Dict, List, Optional, Set, Tuple, Type, Union
from vllm.v1.worker.gpu_worker import Worker, _check_if_gpu_supports_dtype, init_worker_distributed_environment
from vllm.model_executor import set_random_seed
from .model_runner import KunlunModelRunner
from vllm.utils import MemorySnapshot
import torch
import os
import gc

class KunlunWorker(Worker):
    """vLLM v1 ``Worker`` subclass that runs models via ``KunlunModelRunner``.

    This class overrides ``init_device`` so that the worker records its
    initial memory state and constructs a ``KunlunModelRunner``.
    """

    def init_device(self):
        """Bind the worker to its device and construct the model runner.

        In order, this method:

        1. Rejects any configured device whose type is not ``"cuda"``.
        2. Adjusts NCCL-related environment variables and selects
           ``cuda:<local_rank>`` as the current device.
        3. Checks dtype support, clears cached allocations and records the
           initial memory snapshot together with the memory budget
           (``self.requested_memory``).
        4. Initializes the distributed environment and seeds the RNGs.
        5. Creates ``self.model_runner``.

        Raises:
            RuntimeError: If ``self.device_config.device.type`` is not
                ``"cuda"``.
        """
        # NOTE(review): presumably the Kunlun stack exposes its devices
        # through the ``torch.cuda`` API, hence the "cuda" check -- confirm.
        if self.device_config.device.type != "cuda":
            raise RuntimeError(
                f"Not support device type: {self.device_config.device}")

        # torch.distributed.all_reduce does not free the input tensor until
        # the synchronization point. This causes the memory usage to grow
        # as the number of all_reduce calls increases. This env var disables
        # this behavior.
        # Related issue:
        # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
        os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"

        # This env var set by Ray causes exceptions with graph building.
        os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)

        self.device = torch.device(f"cuda:{self.local_rank}")
        torch.cuda.set_device(self.device)

        _check_if_gpu_supports_dtype(self.model_config.dtype)

        # Start from a clean allocator state so the baseline measurements
        # below are not skewed by garbage or cached blocks.
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

        self.init_snapshot = MemorySnapshot()
        free_memory, total_memory = torch.cuda.mem_get_info()
        self.init_gpu_memory = free_memory

        # Memory budget for this worker: the user-configured fraction of the
        # device's total memory. A hard-coded ``total * 0.2`` would request
        # only 20% of the device (the intent was to *keep* ~20% as headroom)
        # and would ignore the ``gpu_memory_utilization`` setting.
        utilization = self.vllm_config.cache_config.gpu_memory_utilization
        self.requested_memory = int(total_memory * utilization)

        # Initialize the distributed environment.
        init_worker_distributed_environment(self.vllm_config,
                                            self.rank,
                                            self.distributed_init_method,
                                            self.local_rank)
        # Set random seed.
        set_random_seed(self.model_config.seed)
        # Construct the model runner
        self.model_runner: KunlunModelRunner = KunlunModelRunner(
            self.vllm_config, self.device)