Overlapped weight offload (#8034)

This commit is contained in:
fzyzcjy
2025-08-23 17:06:46 +08:00
committed by GitHub
parent ccd3fb946e
commit 2600fc0d47
9 changed files with 584 additions and 10 deletions

View File

@@ -23,8 +23,10 @@ import dataclasses
import logging
import multiprocessing as mp
import os
import random
import signal
import threading
import time
from typing import AsyncIterator, Dict, Iterator, List, Optional, Tuple, Union
import zmq
@@ -654,6 +656,11 @@ def _set_envs_and_config(server_args: ServerArgs):
# flashinfer uses this environment variable for various kernels from MoE to quant kernels
os.environ["TRTLLM_ENABLE_PDL"] = "1"
# Run ID for this launch; it can also be passed as an argument
os.environ["SGLANG_RUN_ID"] = (
f"sglang-run-{time.time()}-{random.randint(0, 100000000)}"
)
# Set prometheus env vars
if server_args.enable_metrics:
set_prometheus_multiproc_dir()