init
vllm/envs.py (new file, 217 lines)
@@ -0,0 +1,217 @@
import os
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional

if TYPE_CHECKING:
    VLLM_HOST_IP: str = ""
    VLLM_USE_MODELSCOPE: bool = False
    VLLM_INSTANCE_ID: Optional[str] = None
    CUDA_HOME: Optional[str] = None
    VLLM_NCCL_SO_PATH: Optional[str] = None
    LD_LIBRARY_PATH: Optional[str] = None
    VLLM_USE_TRITON_FLASH_ATTN: bool = False
    LOCAL_RANK: int = 0
    CUDA_VISIBLE_DEVICES: Optional[str] = None
    VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
    VLLM_API_KEY: Optional[str] = None
    S3_ACCESS_KEY_ID: Optional[str] = None
    S3_SECRET_ACCESS_KEY: Optional[str] = None
    S3_ENDPOINT_URL: Optional[str] = None
    VLLM_CONFIG_ROOT: str = ""
    VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
    VLLM_NO_USAGE_STATS: bool = False
    VLLM_DO_NOT_TRACK: bool = False
    VLLM_USAGE_SOURCE: str = ""
    VLLM_CONFIGURE_LOGGING: int = 1
    VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
    VLLM_TRACE_FUNCTION: int = 0
    VLLM_ATTENTION_BACKEND: Optional[str] = None
    VLLM_CPU_KVCACHE_SPACE: int = 0
    VLLM_USE_RAY_COMPILED_DAG: bool = False
    VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
    VLLM_TARGET_DEVICE: str = "cuda"
    MAX_JOBS: Optional[str] = None
    NVCC_THREADS: Optional[str] = None
    VLLM_BUILD_WITH_NEURON: bool = False
    VLLM_USE_PRECOMPILED: bool = False
    VLLM_INSTALL_PUNICA_KERNELS: bool = False
    CMAKE_BUILD_TYPE: Optional[str] = None
    VERBOSE: bool = False
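
# NOTE: the assignments in the `if TYPE_CHECKING` block above exist only so
# that static type checkers and IDEs see these names with useful types; at
# runtime the module resolves each attribute lazily through the module-level
# __getattr__ defined at the bottom of this file.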

# The begin-* and end-* here are used by the documentation generator
# to extract the used env vars.

# begin-env-vars-definition

environment_variables: Dict[str, Callable[[], Any]] = {

    # ================== Installation Time Env Vars ==================

    # Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
    "VLLM_TARGET_DEVICE":
    lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),

    # Maximum number of compilation jobs to run in parallel.
    # By default this is the number of CPUs.
    "MAX_JOBS":
    lambda: os.getenv("MAX_JOBS", None),

    # Number of threads to use for nvcc.
    # By default this is 1.
    # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
    "NVCC_THREADS":
    lambda: os.getenv("NVCC_THREADS", None),

    # If set, vllm will build with Neuron support
    "VLLM_BUILD_WITH_NEURON":
    lambda: bool(os.environ.get("VLLM_BUILD_WITH_NEURON", False)),

    # If set, vllm will use precompiled binaries (*.so)
    "VLLM_USE_PRECOMPILED":
    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")),

    # If set, vllm will install Punica kernels
    "VLLM_INSTALL_PUNICA_KERNELS":
    lambda: bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))),

    # CMake build type
    # If not set, defaults to "Debug" or "RelWithDebInfo"
    # Available options: "Debug", "Release", "RelWithDebInfo"
    "CMAKE_BUILD_TYPE":
    lambda: os.getenv("CMAKE_BUILD_TYPE"),

    # If set, vllm will print verbose logs during installation
    "VERBOSE":
    lambda: bool(int(os.getenv("VERBOSE", "0"))),

    # Root directory for vLLM configuration files.
    # Note that this not only affects how vllm finds its configuration files
    # during runtime, but also affects how vllm installs its configuration
    # files during **installation**.
    "VLLM_CONFIG_ROOT":
    lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
        "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),
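
    # NOTE: entries such as VLLM_CONFIG_ROOT above and VLLM_HOST_IP below
    # chain their lookups with `or`, so the first non-empty value wins; a
    # variable that is unset and one set to the empty string are treated
    # identically.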

    # ================== Runtime Env Vars ==================

    # used in distributed environment to determine the master address
    "VLLM_HOST_IP":
    lambda: os.getenv("VLLM_HOST_IP", "") or os.getenv("HOST_IP", ""),

    # If true, will load models from ModelScope instead of Hugging Face Hub.
    # Note that the value is "true" or "false", not a number.
    "VLLM_USE_MODELSCOPE":
    lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true",

    # The instance id represents one running instance of vLLM. All processes
    # in the same instance should have the same instance id.
    "VLLM_INSTANCE_ID":
    lambda: os.environ.get("VLLM_INSTANCE_ID", None),

    # path to the CUDA toolkit home directory, under which should be bin,
    # include, and lib directories.
    "CUDA_HOME":
    lambda: os.environ.get("CUDA_HOME", None),

    # Path to the NCCL library file. It is needed because nccl>=2.19 brought
    # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
    "VLLM_NCCL_SO_PATH":
    lambda: os.environ.get("VLLM_NCCL_SO_PATH", None),

    # when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl
    # library file in the locations specified by `LD_LIBRARY_PATH`
    "LD_LIBRARY_PATH":
    lambda: os.environ.get("LD_LIBRARY_PATH", None),

    # flag to control if vllm should use triton flash attention
    "VLLM_USE_TRITON_FLASH_ATTN":
    lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in
             ("true", "1")),

    # local rank of the process in the distributed setting, used to determine
    # the GPU device id
    "LOCAL_RANK":
    lambda: int(os.environ.get("LOCAL_RANK", "0")),

    # used to control the visible devices in the distributed setting
    "CUDA_VISIBLE_DEVICES":
    lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),

    # timeout for each iteration in the engine
    "VLLM_ENGINE_ITERATION_TIMEOUT_S":
    lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),

    # API key for the vLLM API server
    "VLLM_API_KEY":
    lambda: os.environ.get("VLLM_API_KEY", None),

    # S3 access information, used for tensorizer to load model from S3
    "S3_ACCESS_KEY_ID":
    lambda: os.environ.get("S3_ACCESS_KEY_ID", None),
    "S3_SECRET_ACCESS_KEY":
    lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
    "S3_ENDPOINT_URL":
    lambda: os.environ.get("S3_ENDPOINT_URL", None),

    # Usage stats collection
    "VLLM_USAGE_STATS_SERVER":
    lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
    "VLLM_NO_USAGE_STATS":
    lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
    "VLLM_DO_NOT_TRACK":
    lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(
        "DO_NOT_TRACK", None) or "0") == "1",
    "VLLM_USAGE_SOURCE":
    lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"),

    # Logging configuration
    # If set to 0, vllm will not configure logging
    # If set to 1, vllm will configure logging using the default configuration
    # or the configuration file specified by VLLM_LOGGING_CONFIG_PATH
    "VLLM_CONFIGURE_LOGGING":
    lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),
    "VLLM_LOGGING_CONFIG_PATH":
    lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),

    # Trace function calls
    # If set to 1, vllm will trace function calls
    # Useful for debugging
    "VLLM_TRACE_FUNCTION":
    lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),

    # Backend for attention computation
    # Available options:
    # - "TORCH_SDPA": use torch.nn.functional.scaled_dot_product_attention
    # - "FLASH_ATTN": use FlashAttention
    # - "XFORMERS": use XFormers
    # - "ROCM_FLASH": use ROCmFlashAttention
    "VLLM_ATTENTION_BACKEND":
    lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),

    # CPU key-value cache space (in GiB);
    # 0 means fall back to the default of 4 GiB
    "VLLM_CPU_KVCACHE_SPACE":
    lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),

    # If the env var is set, it uses Ray's compiled DAG API,
    # which optimizes the control plane overhead.
    # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
    "VLLM_USE_RAY_COMPILED_DAG":
    lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))),

    # Use dedicated multiprocess context for workers.
    # Both spawn and fork work.
    "VLLM_WORKER_MULTIPROC_METHOD":
    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
}

# end-env-vars-definition
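
# Illustrative sketch (not part of the original module): the entries above
# parse boolean-like env vars in several non-interchangeable ways. A bare
# bool() over os.environ.get() treats ANY non-empty string as truthy, so it
# is only used for "if set" style flags; flags documented as 0/1 values go
# through int() first:
#
#     import os
#     os.environ["FLAG"] = "0"
#     bool(os.environ.get("FLAG"))       # True  -- "0" is a non-empty string
#     bool(int(os.environ.get("FLAG")))  # False -- respects the 0/1 value
#     os.environ.get("FLAG", "False").lower() == "true"  # False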


def __getattr__(name):
    # lazy evaluation of environment variables
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def __dir__():
    return list(environment_variables.keys())
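

# Illustrative usage (not part of the original module): thanks to the
# module-level __getattr__ above (PEP 562), each attribute access re-reads
# the environment instead of freezing values at import time:
#
#     import os
#     import vllm.envs as envs
#
#     os.environ["VLLM_TRACE_FUNCTION"] = "1"
#     print(envs.VLLM_TRACE_FUNCTION)  # 1 -- evaluated on this access
#     del os.environ["VLLM_TRACE_FUNCTION"]
#     print(envs.VLLM_TRACE_FUNCTION)  # 0 -- falls back to the default
#
# The underlying thunk can also be called directly:
#
#     envs.environment_variables["VLLM_TRACE_FUNCTION"]()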