enginex-bi_series-vllm/vllm/envs.py

import os
import tempfile
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional

if TYPE_CHECKING:
    VLLM_HOST_IP: str = ""
    VLLM_PORT: Optional[int] = None
    VLLM_RPC_BASE_PATH: str = tempfile.gettempdir()
    VLLM_USE_MODELSCOPE: bool = False
    VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60
    VLLM_INSTANCE_ID: Optional[str] = None
    VLLM_NCCL_SO_PATH: Optional[str] = None
    LD_LIBRARY_PATH: Optional[str] = None
    VLLM_USE_TRITON_FLASH_ATTN: bool = False
    LOCAL_RANK: int = 0
    CUDA_VISIBLE_DEVICES: Optional[str] = None
    VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
    VLLM_API_KEY: Optional[str] = None
    S3_ACCESS_KEY_ID: Optional[str] = None
    S3_SECRET_ACCESS_KEY: Optional[str] = None
    S3_ENDPOINT_URL: Optional[str] = None
    VLLM_CACHE_ROOT: str = os.path.expanduser("~/.cache/vllm")
    VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm")
    VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
    VLLM_NO_USAGE_STATS: bool = False
    VLLM_DO_NOT_TRACK: bool = False
    VLLM_USAGE_SOURCE: str = ""
    VLLM_CONFIGURE_LOGGING: int = 1
    VLLM_LOGGING_LEVEL: str = "INFO"
    VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
    VLLM_TRACE_FUNCTION: int = 0
    VLLM_ATTENTION_BACKEND: Optional[str] = None
    VLLM_USE_FLASHINFER_SAMPLER: bool = False
    VLLM_USE_FLASHINFER_REJECTION_SAMPLER: bool = False
    VLLM_PP_LAYER_PARTITION: Optional[str] = None
    VLLM_CPU_KVCACHE_SPACE: int = 0
    VLLM_CPU_OMP_THREADS_BIND: str = ""
    VLLM_OPENVINO_DEVICE: str = "CPU"
    VLLM_OPENVINO_KVCACHE_SPACE: int = 0
    VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
    VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
    VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
    VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
    VLLM_USE_RAY_SPMD_WORKER: bool = False
    VLLM_USE_RAY_COMPILED_DAG: bool = False
    VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True
    VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
    VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
    VLLM_IMAGE_FETCH_TIMEOUT: int = 5
    VLLM_AUDIO_FETCH_TIMEOUT: int = 5
    VLLM_TARGET_DEVICE: str = "cuda"
    MAX_JOBS: Optional[str] = None
    NVCC_THREADS: Optional[str] = None
    VLLM_USE_PRECOMPILED: bool = False
    VLLM_NO_DEPRECATION_WARNING: bool = False
    VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
    CMAKE_BUILD_TYPE: Optional[str] = None
    VERBOSE: bool = False
    VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
    VLLM_TEST_FORCE_FP8_MARLIN: bool = False
    VLLM_RPC_TIMEOUT: int = 10000  # ms
    VLLM_PLUGINS: Optional[List[str]] = None
    VLLM_TORCH_PROFILER_DIR: Optional[str] = None
    VLLM_USE_TRITON_AWQ: bool = False
    VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
    VLLM_SKIP_P2P_CHECK: bool = False
    VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1: bool = False
    VLLM_TORCH_COMPILE_LEVEL: int = 0
    VLLM_V0_USE_OUTLINES_CACHE: bool = False


def get_default_cache_root():
    return os.getenv(
        "XDG_CACHE_HOME",
        os.path.join(os.path.expanduser("~"), ".cache"),
    )


def get_default_config_root():
    return os.getenv(
        "XDG_CONFIG_HOME",
        os.path.join(os.path.expanduser("~"), ".config"),
    )


# The begin-* and end* here are used by the documentation generator
# to extract the used env vars.

# begin-env-vars-definition

environment_variables: Dict[str, Callable[[], Any]] = {

    # ================== Installation Time Env Vars ==================

    # Target device of vLLM, supporting [cuda (by default),
    # rocm, neuron, cpu, openvino]
    "VLLM_TARGET_DEVICE":
    lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),

    # Maximum number of compilation jobs to run in parallel.
    # By default this is the number of CPUs
    "MAX_JOBS":
    lambda: os.getenv("MAX_JOBS", None),

    # Number of threads to use for nvcc
    # By default this is 1.
    # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
    "NVCC_THREADS":
    lambda: os.getenv("NVCC_THREADS", None),

    # If set, vllm will use precompiled binaries (*.so)
    "VLLM_USE_PRECOMPILED":
    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")),

    # CMake build type
    # If not set, defaults to "Debug" or "RelWithDebInfo"
    # Available options: "Debug", "Release", "RelWithDebInfo"
    "CMAKE_BUILD_TYPE":
    lambda: os.getenv("CMAKE_BUILD_TYPE"),

    # If set, vllm will print verbose logs during installation
    "VERBOSE":
    lambda: bool(int(os.getenv('VERBOSE', '0'))),

    # Root directory for VLLM configuration files
    # Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
    # Note that this not only affects how vllm finds its configuration files
    # during runtime, but also affects how vllm installs its configuration
    # files during **installation**.
    "VLLM_CONFIG_ROOT":
    lambda: os.path.expanduser(
        os.getenv(
            "VLLM_CONFIG_ROOT",
            os.path.join(get_default_config_root(), "vllm"),
        )),

    # ================== Runtime Env Vars ==================

    # Root directory for VLLM cache files
    # Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
    "VLLM_CACHE_ROOT":
    lambda: os.path.expanduser(
        os.getenv(
            "VLLM_CACHE_ROOT",
            os.path.join(get_default_cache_root(), "vllm"),
        )),

    # used in distributed environment to determine the ip address
    # of the current node, when the node has multiple network interfaces.
    # If you are using multi-node inference, you should set this differently
    # on each node.
    'VLLM_HOST_IP':
    lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),

    # used in distributed environment to manually set the communication port
    # Note: if VLLM_PORT is set, and some code asks for multiple ports, the
    # VLLM_PORT will be used as the first port, and the rest will be generated
    # by incrementing the VLLM_PORT value.
    # '0' is used to make mypy happy
    'VLLM_PORT':
    lambda: int(os.getenv('VLLM_PORT', '0'))
    if 'VLLM_PORT' in os.environ else None,

    # path used for ipc when the frontend api server is running in
    # multi-processing mode to communicate with the backend engine process.
    'VLLM_RPC_BASE_PATH':
    lambda: os.getenv('VLLM_RPC_BASE_PATH', tempfile.gettempdir()),

    # If true, will load models from ModelScope instead of Hugging Face Hub.
    # note that the value is true or false, not numbers
    "VLLM_USE_MODELSCOPE":
    lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true",

    # Instance id represents an instance of the VLLM. All processes in the same
    # instance should have the same instance id.
    "VLLM_INSTANCE_ID":
    lambda: os.environ.get("VLLM_INSTANCE_ID", None),

    # Interval in seconds to log a warning message when the ring buffer is full
    "VLLM_RINGBUFFER_WARNING_INTERVAL":
    lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")),

    # path to cudatoolkit home directory, under which should be bin, include,
    # and lib directories.
    "CUDA_HOME":
    lambda: os.environ.get("CUDA_HOME", None),

    # Path to the NCCL library file. It is needed because nccl>=2.19 brought
    # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
    "VLLM_NCCL_SO_PATH":
    lambda: os.environ.get("VLLM_NCCL_SO_PATH", None),

    # when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl
    # library file in the locations specified by `LD_LIBRARY_PATH`
    "LD_LIBRARY_PATH":
    lambda: os.environ.get("LD_LIBRARY_PATH", None),

    # flag to control if vllm should use triton flash attention
    "VLLM_USE_TRITON_FLASH_ATTN":
    lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in
             ("true", "1")),

    # Internal flag to enable Dynamo fullgraph capture
    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE":
    lambda: bool(
        os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),
    "VLLM_TORCH_COMPILE_LEVEL":
    lambda: int(os.environ.get("VLLM_TORCH_COMPILE_LEVEL", "0")),

    # local rank of the process in the distributed setting, used to determine
    # the GPU device id
    "LOCAL_RANK":
    lambda: int(os.environ.get("LOCAL_RANK", "0")),

    # used to control the visible devices in the distributed setting
    "CUDA_VISIBLE_DEVICES":
    lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),

    # timeout for each iteration in the engine
    "VLLM_ENGINE_ITERATION_TIMEOUT_S":
    lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),

    # API key for VLLM API server
    "VLLM_API_KEY":
    lambda: os.environ.get("VLLM_API_KEY", None),

    # S3 access information, used for tensorizer to load model from S3
    "S3_ACCESS_KEY_ID":
    lambda: os.environ.get("S3_ACCESS_KEY_ID", None),
    "S3_SECRET_ACCESS_KEY":
    lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
    "S3_ENDPOINT_URL":
    lambda: os.environ.get("S3_ENDPOINT_URL", None),

    # Usage stats collection
    "VLLM_USAGE_STATS_SERVER":
    lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
    "VLLM_NO_USAGE_STATS":
    lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
    "VLLM_DO_NOT_TRACK":
    lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(
        "DO_NOT_TRACK", None) or "0") == "1",
    "VLLM_USAGE_SOURCE":
    lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"),

    # Logging configuration
    # If set to 0, vllm will not configure logging
    # If set to 1, vllm will configure logging using the default configuration
    #    or the configuration file specified by VLLM_LOGGING_CONFIG_PATH
    "VLLM_CONFIGURE_LOGGING":
    lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),
    "VLLM_LOGGING_CONFIG_PATH":
    lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),

    # this is used for configuring the default logging level
    "VLLM_LOGGING_LEVEL":
    lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO"),

    # Trace function calls
    # If set to 1, vllm will trace function calls
    # Useful for debugging
    "VLLM_TRACE_FUNCTION":
    lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),

    # Backend for attention computation
    # Available options:
    # - "TORCH_SDPA": use torch.nn.MultiheadAttention
    # - "FLASH_ATTN": use FlashAttention
    # - "XFORMERS": use XFormers
    # - "ROCM_FLASH": use ROCmFlashAttention
    # - "FLASHINFER": use flashinfer
    "VLLM_ATTENTION_BACKEND":
    lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),

    # If set, vllm will use flashinfer sampler
    "VLLM_USE_FLASHINFER_SAMPLER":
    lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_SAMPLER", "0"))),

    # Pipeline stage partition strategy
    "VLLM_PP_LAYER_PARTITION":
    lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),

    # (CPU backend only) CPU key-value cache space.
    # default is 4GB
    "VLLM_CPU_KVCACHE_SPACE":
    lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),

    # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
    # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
    "VLLM_CPU_OMP_THREADS_BIND":
    lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "all"),

    # OpenVINO device selection
    # default is CPU
    "VLLM_OPENVINO_DEVICE":
    lambda: os.getenv("VLLM_OPENVINO_DEVICE", "CPU").upper(),

    # OpenVINO key-value cache space
    # default is 4GB
    "VLLM_OPENVINO_KVCACHE_SPACE":
    lambda: int(os.getenv("VLLM_OPENVINO_KVCACHE_SPACE", "0")),

    # OpenVINO KV cache precision
    # default is bf16 if natively supported by platform, otherwise f16
    # To enable KV cache compression, please, explicitly specify u8
    "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION":
    lambda: os.getenv("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION", None),

    # Enables weights compression during model export via HF Optimum
    # default is False
    "VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":
    lambda: bool(os.getenv("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False)),

    # If the env var is set, then all workers will execute as separate
    # processes from the engine, and we use the same mechanism to trigger
    # execution on all workers.
    # Run vLLM with VLLM_USE_RAY_SPMD_WORKER=1 to enable it.
    "VLLM_USE_RAY_SPMD_WORKER":
    lambda: bool(int(os.getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))),

    # If the env var is set, it uses the Ray's compiled DAG API
    # which optimizes the control plane overhead.
    # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
    "VLLM_USE_RAY_COMPILED_DAG":
    lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))),

    # If the env var is set, it uses NCCL for communication in
    # Ray's compiled DAG. This flag is ignored if
    # VLLM_USE_RAY_COMPILED_DAG is not set.
    "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL":
    lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", "1"))
                 ),

    # Use dedicated multiprocess context for workers.
    # Both spawn and fork work
    "VLLM_WORKER_MULTIPROC_METHOD":
    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),

    # Path to the cache for storing downloaded assets
    "VLLM_ASSETS_CACHE":
    lambda: os.path.expanduser(
        os.getenv(
            "VLLM_ASSETS_CACHE",
            os.path.join(get_default_cache_root(), "vllm", "assets"),
        )),

    # Timeout for fetching images when serving multimodal models
    # Default is 5 seconds
    "VLLM_IMAGE_FETCH_TIMEOUT":
    lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),

    # Timeout for fetching audio when serving multimodal models
    # Default is 5 seconds
    "VLLM_AUDIO_FETCH_TIMEOUT":
    lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "5")),

    # Path to the XLA persistent cache directory.
    # Only used for XLA devices such as TPUs.
    "VLLM_XLA_CACHE_PATH":
    lambda: os.path.expanduser(
        os.getenv(
            "VLLM_XLA_CACHE_PATH",
            os.path.join(get_default_cache_root(), "vllm", "xla_cache"),
        )),
    "VLLM_FUSED_MOE_CHUNK_SIZE":
    lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")),

    # If set, vllm will skip the deprecation warnings.
    "VLLM_NO_DEPRECATION_WARNING":
    lambda: bool(int(os.getenv("VLLM_NO_DEPRECATION_WARNING", "0"))),

    # If set, the OpenAI API server will stay alive even after the underlying
    # AsyncLLMEngine errors and stops serving requests
    "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH":
    lambda: bool(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)),

    # If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set, it allows
    # the user to specify a max sequence length greater than
    # the max length derived from the model's config.json.
    # To enable this, set VLLM_ALLOW_LONG_MAX_MODEL_LEN=1.
    "VLLM_ALLOW_LONG_MAX_MODEL_LEN":
    lambda:
    (os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0").strip().lower() in
     ("1", "true")),

    # If set, forces FP8 Marlin to be used for FP8 quantization regardless
    # of the hardware support for FP8 compute.
    "VLLM_TEST_FORCE_FP8_MARLIN":
    lambda:
    (os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in
     ("1", "true")),
    "VLLM_TEST_FORCE_LOAD_FORMAT":
    lambda: os.getenv("VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"),

    # Time in ms for the zmq client to wait for a response from the backend
    # server for simple data operations
    "VLLM_RPC_TIMEOUT":
    lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),

    # a list of plugin names to load, separated by commas.
    # if this is not set, it means all plugins will be loaded
    # if this is set to an empty string, no plugins will be loaded
    "VLLM_PLUGINS":
    lambda: None if "VLLM_PLUGINS" not in os.environ else os.environ[
        "VLLM_PLUGINS"].split(","),

    # Enables torch profiler if set. Path to the directory where torch profiler
    # traces are saved. Note that it must be an absolute path.
    "VLLM_TORCH_PROFILER_DIR":
    lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os
             .path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))),

    # If set, vLLM will use Triton implementations of AWQ.
    "VLLM_USE_TRITON_AWQ":
    lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),

    # If set, allow loading or unloading lora adapters in runtime,
    "VLLM_ALLOW_RUNTIME_LORA_UPDATING":
    lambda:
    (os.environ.get("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "0").strip().lower() in
     ("1", "true")),

    # By default, vLLM will check the peer-to-peer capability itself,
    # in case of broken drivers. See https://github.com/vllm-project/vllm/blob/a9b15c606fea67a072416ea0ea115261a2756058/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L101-L108 for details. # noqa
    # If this env var is set to 1, vLLM will skip the peer-to-peer check,
    # and trust the driver's peer-to-peer capability report.
    "VLLM_SKIP_P2P_CHECK":
    lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "0") == "1",

    # If set, allowing the use of deprecated block manager V1
    "VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1":
    lambda: os.environ.get("VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1", "0"
                           ) == "1",
    
    # Whether to turn on the outlines cache for V0
    # This cache is unbounded and on disk, so it's not safe to use in
    # an environment with potentially malicious users.
    "VLLM_V0_USE_OUTLINES_CACHE":
    lambda: os.environ.get("VLLM_V0_USE_OUTLINES_CACHE", "0") == "1",
}

# end-env-vars-definition


def __getattr__(name: str):
    # lazy evaluation of environment variables
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def __dir__():
    return list(environment_variables.keys())
First commit 2025-08-05 19:02:46 +08:00			`import os`
			`import tempfile`
			`from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional`

			`if TYPE_CHECKING:`
			`VLLM_HOST_IP: str = ""`
			`VLLM_PORT: Optional[int] = None`
			`VLLM_RPC_BASE_PATH: str = tempfile.gettempdir()`
			`VLLM_USE_MODELSCOPE: bool = False`
			`VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60`
			`VLLM_INSTANCE_ID: Optional[str] = None`
			`VLLM_NCCL_SO_PATH: Optional[str] = None`
			`LD_LIBRARY_PATH: Optional[str] = None`
			`VLLM_USE_TRITON_FLASH_ATTN: bool = False`
			`LOCAL_RANK: int = 0`
			`CUDA_VISIBLE_DEVICES: Optional[str] = None`
			`VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60`
			`VLLM_API_KEY: Optional[str] = None`
			`S3_ACCESS_KEY_ID: Optional[str] = None`
			`S3_SECRET_ACCESS_KEY: Optional[str] = None`
			`S3_ENDPOINT_URL: Optional[str] = None`
			`VLLM_CACHE_ROOT: str = os.path.expanduser("~/.cache/vllm")`
			`VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm")`
			`VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"`
			`VLLM_NO_USAGE_STATS: bool = False`
			`VLLM_DO_NOT_TRACK: bool = False`
			`VLLM_USAGE_SOURCE: str = ""`
			`VLLM_CONFIGURE_LOGGING: int = 1`
			`VLLM_LOGGING_LEVEL: str = "INFO"`
			`VLLM_LOGGING_CONFIG_PATH: Optional[str] = None`
			`VLLM_TRACE_FUNCTION: int = 0`
			`VLLM_ATTENTION_BACKEND: Optional[str] = None`
			`VLLM_USE_FLASHINFER_SAMPLER: bool = False`
			`VLLM_USE_FLASHINFER_REJECTION_SAMPLER: bool = False`
			`VLLM_PP_LAYER_PARTITION: Optional[str] = None`
			`VLLM_CPU_KVCACHE_SPACE: int = 0`
			`VLLM_CPU_OMP_THREADS_BIND: str = ""`
			`VLLM_OPENVINO_DEVICE: str = "CPU"`
			`VLLM_OPENVINO_KVCACHE_SPACE: int = 0`
			`VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None`
			`VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False`
			`VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")`
			`VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024`
			`VLLM_USE_RAY_SPMD_WORKER: bool = False`
			`VLLM_USE_RAY_COMPILED_DAG: bool = False`
			`VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True`
			`VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"`
			`VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")`
			`VLLM_IMAGE_FETCH_TIMEOUT: int = 5`
			`VLLM_AUDIO_FETCH_TIMEOUT: int = 5`
			`VLLM_TARGET_DEVICE: str = "cuda"`
			`MAX_JOBS: Optional[str] = None`
			`NVCC_THREADS: Optional[str] = None`
			`VLLM_USE_PRECOMPILED: bool = False`
			`VLLM_NO_DEPRECATION_WARNING: bool = False`
			`VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False`
			`CMAKE_BUILD_TYPE: Optional[str] = None`
			`VERBOSE: bool = False`
			`VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False`
			`VLLM_TEST_FORCE_FP8_MARLIN: bool = False`
			`VLLM_RPC_TIMEOUT: int = 10000 # ms`
			`VLLM_PLUGINS: Optional[List[str]] = None`
			`VLLM_TORCH_PROFILER_DIR: Optional[str] = None`
			`VLLM_USE_TRITON_AWQ: bool = False`
			`VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False`
			`VLLM_SKIP_P2P_CHECK: bool = False`
			`VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1: bool = False`
			`VLLM_TORCH_COMPILE_LEVEL: int = 0`
Update to new version of base image 2025-10-24 15:45:06 +08:00			`VLLM_V0_USE_OUTLINES_CACHE: bool = False`
First commit 2025-08-05 19:02:46 +08:00

			`def get_default_cache_root():`
			`return os.getenv(`
			`"XDG_CACHE_HOME",`
			`os.path.join(os.path.expanduser("~"), ".cache"),`
			`)`


			`def get_default_config_root():`
			`return os.getenv(`
			`"XDG_CONFIG_HOME",`
			`os.path.join(os.path.expanduser("~"), ".config"),`
			`)`


			`# The begin-* and end* here are used by the documentation generator`
			`# to extract the used env vars.`

			`# begin-env-vars-definition`

			`environment_variables: Dict[str, Callable[[], Any]] = {`

			`# ================== Installation Time Env Vars ==================`

			`# Target device of vLLM, supporting [cuda (by default),`
			`# rocm, neuron, cpu, openvino]`
			`"VLLM_TARGET_DEVICE":`
			`lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),`

			`# Maximum number of compilation jobs to run in parallel.`
			`# By default this is the number of CPUs`
			`"MAX_JOBS":`
			`lambda: os.getenv("MAX_JOBS", None),`

			`# Number of threads to use for nvcc`
			`# By default this is 1.`
			# If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
			`"NVCC_THREADS":`
			`lambda: os.getenv("NVCC_THREADS", None),`

			`# If set, vllm will use precompiled binaries (*.so)`
			`"VLLM_USE_PRECOMPILED":`
			`lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")),`

			`# CMake build type`
			`# If not set, defaults to "Debug" or "RelWithDebInfo"`
			`# Available options: "Debug", "Release", "RelWithDebInfo"`
			`"CMAKE_BUILD_TYPE":`
			`lambda: os.getenv("CMAKE_BUILD_TYPE"),`

			`# If set, vllm will print verbose logs during installation`
			`"VERBOSE":`
			`lambda: bool(int(os.getenv('VERBOSE', '0'))),`

			`# Root directory for VLLM configuration files`
			# Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
			`# Note that this not only affects how vllm finds its configuration files`
			`# during runtime, but also affects how vllm installs its configuration`
			`# files during installation.`
			`"VLLM_CONFIG_ROOT":`
			`lambda: os.path.expanduser(`
			`os.getenv(`
			`"VLLM_CONFIG_ROOT",`
			`os.path.join(get_default_config_root(), "vllm"),`
			`)),`

			`# ================== Runtime Env Vars ==================`

			`# Root directory for VLLM cache files`
			# Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
			`"VLLM_CACHE_ROOT":`
			`lambda: os.path.expanduser(`
			`os.getenv(`
			`"VLLM_CACHE_ROOT",`
			`os.path.join(get_default_cache_root(), "vllm"),`
			`)),`

			`# used in distributed environment to determine the ip address`
			`# of the current node, when the node has multiple network interfaces.`
			`# If you are using multi-node inference, you should set this differently`
			`# on each node.`
			`'VLLM_HOST_IP':`
			`lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),`

			`# used in distributed environment to manually set the communication port`
			`# Note: if VLLM_PORT is set, and some code asks for multiple ports, the`
			`# VLLM_PORT will be used as the first port, and the rest will be generated`
			`# by incrementing the VLLM_PORT value.`
			`# '0' is used to make mypy happy`
			`'VLLM_PORT':`
			`lambda: int(os.getenv('VLLM_PORT', '0'))`
			`if 'VLLM_PORT' in os.environ else None,`

			`# path used for ipc when the frontend api server is running in`
			`# multi-processing mode to communicate with the backend engine process.`
			`'VLLM_RPC_BASE_PATH':`
			`lambda: os.getenv('VLLM_RPC_BASE_PATH', tempfile.gettempdir()),`

			`# If true, will load models from ModelScope instead of Hugging Face Hub.`
			`# note that the value is true or false, not numbers`
			`"VLLM_USE_MODELSCOPE":`
			`lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true",`

			`# Instance id represents an instance of the VLLM. All processes in the same`
			`# instance should have the same instance id.`
			`"VLLM_INSTANCE_ID":`
			`lambda: os.environ.get("VLLM_INSTANCE_ID", None),`

			`# Interval in seconds to log a warning message when the ring buffer is full`
			`"VLLM_RINGBUFFER_WARNING_INTERVAL":`
			`lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")),`

			`# path to cudatoolkit home directory, under which should be bin, include,`
			`# and lib directories.`
			`"CUDA_HOME":`
			`lambda: os.environ.get("CUDA_HOME", None),`

			`# Path to the NCCL library file. It is needed because nccl>=2.19 brought`
			`# by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234`
			`"VLLM_NCCL_SO_PATH":`
			`lambda: os.environ.get("VLLM_NCCL_SO_PATH", None),`

			# when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl
			# library file in the locations specified by `LD_LIBRARY_PATH`
			`"LD_LIBRARY_PATH":`
			`lambda: os.environ.get("LD_LIBRARY_PATH", None),`

			`# flag to control if vllm should use triton flash attention`
			`"VLLM_USE_TRITON_FLASH_ATTN":`
			`lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in`
			`("true", "1")),`

			`# Internal flag to enable Dynamo fullgraph capture`
			`"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE":`
			`lambda: bool(`
			`os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),`
			`"VLLM_TORCH_COMPILE_LEVEL":`
			`lambda: int(os.environ.get("VLLM_TORCH_COMPILE_LEVEL", "0")),`

			`# local rank of the process in the distributed setting, used to determine`
			`# the GPU device id`
			`"LOCAL_RANK":`
			`lambda: int(os.environ.get("LOCAL_RANK", "0")),`

			`# used to control the visible devices in the distributed setting`
			`"CUDA_VISIBLE_DEVICES":`
			`lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),`

			`# timeout for each iteration in the engine`
			`"VLLM_ENGINE_ITERATION_TIMEOUT_S":`
			`lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),`

			`# API key for VLLM API server`
			`"VLLM_API_KEY":`
			`lambda: os.environ.get("VLLM_API_KEY", None),`

			`# S3 access information, used for tensorizer to load model from S3`
			`"S3_ACCESS_KEY_ID":`
			`lambda: os.environ.get("S3_ACCESS_KEY_ID", None),`
			`"S3_SECRET_ACCESS_KEY":`
			`lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),`
			`"S3_ENDPOINT_URL":`
			`lambda: os.environ.get("S3_ENDPOINT_URL", None),`

			`# Usage stats collection`
			`"VLLM_USAGE_STATS_SERVER":`
			`lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),`
			`"VLLM_NO_USAGE_STATS":`
			`lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",`
			`"VLLM_DO_NOT_TRACK":`
			`lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(`
			`"DO_NOT_TRACK", None) or "0") == "1",`
			`"VLLM_USAGE_SOURCE":`
			`lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"),`

			`# Logging configuration`
			`# If set to 0, vllm will not configure logging`
			`# If set to 1, vllm will configure logging using the default configuration`
			`# or the configuration file specified by VLLM_LOGGING_CONFIG_PATH`
			`"VLLM_CONFIGURE_LOGGING":`
			`lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),`
			`"VLLM_LOGGING_CONFIG_PATH":`
			`lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),`

			`# this is used for configuring the default logging level`
			`"VLLM_LOGGING_LEVEL":`
			`lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO"),`

			`# Trace function calls`
			`# If set to 1, vllm will trace function calls`
			`# Useful for debugging`
			`"VLLM_TRACE_FUNCTION":`
			`lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),`

			`# Backend for attention computation`
			`# Available options:`
			`# - "TORCH_SDPA": use torch.nn.MultiheadAttention`
			`# - "FLASH_ATTN": use FlashAttention`
			`# - "XFORMERS": use XFormers`
			`# - "ROCM_FLASH": use ROCmFlashAttention`
			`# - "FLASHINFER": use flashinfer`
			`"VLLM_ATTENTION_BACKEND":`
			`lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),`

			`# If set, vllm will use flashinfer sampler`
			`"VLLM_USE_FLASHINFER_SAMPLER":`
			`lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_SAMPLER", "0"))),`

			`# Pipeline stage partition strategy`
			`"VLLM_PP_LAYER_PARTITION":`
			`lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),`

			`# (CPU backend only) CPU key-value cache space.`
			`# default is 4GB`
			`"VLLM_CPU_KVCACHE_SPACE":`
			`lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),`

			`# (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",`
			`# "0,1,2", "0-31,33". CPU cores of different ranks are separated by '\|'.`
			`"VLLM_CPU_OMP_THREADS_BIND":`
			`lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "all"),`

			`# OpenVINO device selection`
			`# default is CPU`
			`"VLLM_OPENVINO_DEVICE":`
			`lambda: os.getenv("VLLM_OPENVINO_DEVICE", "CPU").upper(),`

			`# OpenVINO key-value cache space`
			`# default is 4GB`
			`"VLLM_OPENVINO_KVCACHE_SPACE":`
			`lambda: int(os.getenv("VLLM_OPENVINO_KVCACHE_SPACE", "0")),`

			`# OpenVINO KV cache precision`
			`# default is bf16 if natively supported by platform, otherwise f16`
			`# To enable KV cache compression, please, explicitly specify u8`
			`"VLLM_OPENVINO_CPU_KV_CACHE_PRECISION":`
			`lambda: os.getenv("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION", None),`

			`# Enables weights compression during model export via HF Optimum`
			`# default is False`
			`"VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":`
			`lambda: bool(os.getenv("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False)),`

			`# If the env var is set, then all workers will execute as separate`
			`# processes from the engine, and we use the same mechanism to trigger`
			`# execution on all workers.`
			`# Run vLLM with VLLM_USE_RAY_SPMD_WORKER=1 to enable it.`
			`"VLLM_USE_RAY_SPMD_WORKER":`
			`lambda: bool(int(os.getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))),`

			`# If the env var is set, it uses the Ray's compiled DAG API`
			`# which optimizes the control plane overhead.`
			`# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.`
			`"VLLM_USE_RAY_COMPILED_DAG":`
			`lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))),`

			`# If the env var is set, it uses NCCL for communication in`
			`# Ray's compiled DAG. This flag is ignored if`
			`# VLLM_USE_RAY_COMPILED_DAG is not set.`
			`"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL":`
			`lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", "1"))`
			`),`

			`# Use dedicated multiprocess context for workers.`
			`# Both spawn and fork work`
			`"VLLM_WORKER_MULTIPROC_METHOD":`
			`lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),`

			`# Path to the cache for storing downloaded assets`
			`"VLLM_ASSETS_CACHE":`
			`lambda: os.path.expanduser(`
			`os.getenv(`
			`"VLLM_ASSETS_CACHE",`
			`os.path.join(get_default_cache_root(), "vllm", "assets"),`
			`)),`

			`# Timeout for fetching images when serving multimodal models`
			`# Default is 5 seconds`
			`"VLLM_IMAGE_FETCH_TIMEOUT":`
			`lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),`

			`# Timeout for fetching audio when serving multimodal models`
			`# Default is 5 seconds`
			`"VLLM_AUDIO_FETCH_TIMEOUT":`
			`lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "5")),`

			`# Path to the XLA persistent cache directory.`
			`# Only used for XLA devices such as TPUs.`
			`"VLLM_XLA_CACHE_PATH":`
			`lambda: os.path.expanduser(`
			`os.getenv(`
			`"VLLM_XLA_CACHE_PATH",`
			`os.path.join(get_default_cache_root(), "vllm", "xla_cache"),`
			`)),`
			`"VLLM_FUSED_MOE_CHUNK_SIZE":`
			`lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")),`

			`# If set, vllm will skip the deprecation warnings.`
			`"VLLM_NO_DEPRECATION_WARNING":`
			`lambda: bool(int(os.getenv("VLLM_NO_DEPRECATION_WARNING", "0"))),`

			`# If set, the OpenAI API server will stay alive even after the underlying`
			`# AsyncLLMEngine errors and stops serving requests`
			`"VLLM_KEEP_ALIVE_ON_ENGINE_DEATH":`
			`lambda: bool(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)),`

			`# If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set, it allows`
			`# the user to specify a max sequence length greater than`
			`# the max length derived from the model's config.json.`
			`# To enable this, set VLLM_ALLOW_LONG_MAX_MODEL_LEN=1.`
			`"VLLM_ALLOW_LONG_MAX_MODEL_LEN":`
			`lambda:`
			`(os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0").strip().lower() in`
			`("1", "true")),`

			`# If set, forces FP8 Marlin to be used for FP8 quantization regardless`
			`# of the hardware support for FP8 compute.`
			`"VLLM_TEST_FORCE_FP8_MARLIN":`
			`lambda:`
			`(os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in`
			`("1", "true")),`
			`"VLLM_TEST_FORCE_LOAD_FORMAT":`
			`lambda: os.getenv("VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"),`

			`# Time in ms for the zmq client to wait for a response from the backend`
			`# server for simple data operations`
			`"VLLM_RPC_TIMEOUT":`
			`lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),`

			`# a list of plugin names to load, separated by commas.`
			`# if this is not set, it means all plugins will be loaded`
			`# if this is set to an empty string, no plugins will be loaded`
			`"VLLM_PLUGINS":`
			`lambda: None if "VLLM_PLUGINS" not in os.environ else os.environ[`
			`"VLLM_PLUGINS"].split(","),`

			`# Enables torch profiler if set. Path to the directory where torch profiler`
			`# traces are saved. Note that it must be an absolute path.`
			`"VLLM_TORCH_PROFILER_DIR":`
			`lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os`
			`.path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))),`

			`# If set, vLLM will use Triton implementations of AWQ.`
			`"VLLM_USE_TRITON_AWQ":`
			`lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),`

			`# If set, allow loading or unloading lora adapters in runtime,`
			`"VLLM_ALLOW_RUNTIME_LORA_UPDATING":`
			`lambda:`
			`(os.environ.get("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "0").strip().lower() in`
			`("1", "true")),`

			`# By default, vLLM will check the peer-to-peer capability itself,`
			`# in case of broken drivers. See https://github.com/vllm-project/vllm/blob/a9b15c606fea67a072416ea0ea115261a2756058/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L101-L108 for details. # noqa`
			`# If this env var is set to 1, vLLM will skip the peer-to-peer check,`
			`# and trust the driver's peer-to-peer capability report.`
			`"VLLM_SKIP_P2P_CHECK":`
			`lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "0") == "1",`

			`# If set, allowing the use of deprecated block manager V1`
			`"VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1":`
			`lambda: os.environ.get("VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1", "0"`
			`) == "1",`
Update to new version of base image 2025-10-24 15:45:06 +08:00
			`# Whether to turn on the outlines cache for V0`
			`# This cache is unbounded and on disk, so it's not safe to use in`
			`# an environment with potentially malicious users.`
			`"VLLM_V0_USE_OUTLINES_CACHE":`
			`lambda: os.environ.get("VLLM_V0_USE_OUTLINES_CACHE", "0") == "1",`
First commit 2025-08-05 19:02:46 +08:00			`}`

			`# end-env-vars-definition`


			`def __getattr__(name: str):`
			`# lazy evaluation of environment variables`
			`if name in environment_variables:`
			`return environment_variables[name]()`
			`raise AttributeError(f"module {__name__!r} has no attribute {name!r}")`


			`def __dir__():`
			`return list(environment_variables.keys())`