提交vllm0.11.0开发分支

2025-12-10 17:51:24 +08:00
parent deab7dd0b6
commit 7c22d621fb
175 changed files with 31856 additions and 8683 deletions
--- a/vllm_kunlun/worker/init.py
+++ b/vllm_kunlun/worker/init.py
--- a/vllm_kunlun/worker/model_runner.py
+++ b/vllm_kunlun/worker/model_runner.py
--- a/vllm_kunlun/worker/worker.py
+++ b/vllm_kunlun/worker/worker.py
@@ -0,0 +1,50 @@
+"""worker"""
+from typing import Dict, List, Optional, Set, Tuple, Type, Union
+from vllm.v1.worker.gpu_worker import Worker, _check_if_gpu_supports_dtype, init_worker_distributed_environment
+from vllm.model_executor import set_random_seed
+from .model_runner import KunlunModelRunner
+from vllm.utils import MemorySnapshot
+import torch
+import os
+import gc
+
+class KunlunWorker(Worker):  # vLLM v1 gpu_worker.Worker subclass that builds a KunlunModelRunner
+    """Worker"""
+
+    def init_device(self):  # bind device, record memory baseline, init distributed env, seed, build runner
+        if self.device_config.device.type == "cuda":  # only the "cuda" device type is accepted
+            # torch.distributed.all_reduce does not free the input tensor until
+            # the synchronization point. This causes the memory usage to grow
+            # as the number of all_reduce calls increases. This env var disables
+            # this behavior.
+            # Related issue:
+            # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
+            os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+            # This env var set by Ray causes exceptions with graph building.
+            os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
+            self.device = torch.device(f"cuda:{self.local_rank}")  # one device per local rank
+            torch.cuda.set_device(self.device)
+
+            _check_if_gpu_supports_dtype(self.model_config.dtype)
+            gc.collect()  # drop garbage and cached blocks before measuring memory below
+            torch.cuda.empty_cache()
+            torch.cuda.reset_peak_memory_stats()
+            self.init_snapshot = MemorySnapshot() 
+            free_memory, total = torch.cuda.mem_get_info()  # free and total device memory
+            self.init_gpu_memory = free_memory
+            # Initial memory budget; the original note said "e.g. 80% of total memory".
+            self.requested_memory = int(total * 0.2)  # NOTE(review): 0.2 requests 20%, not "leave 20%"; confirm
+        else:  # any other device type is rejected
+            raise RuntimeError(
+                f"Not support device type: {self.device_config.device}")
+        # Initialize the distributed environment.
+        init_worker_distributed_environment(self.vllm_config, 
+                                            self.rank,
+                                            self.distributed_init_method,
+                                            self.local_rank)
+        # Set random seed.
+        set_random_seed(self.model_config.seed)
+        # Construct the model runner
+        self.model_runner: KunlunModelRunner = KunlunModelRunner(
+            self.vllm_config, self.device)