Use an env var SGLANG_SET_CPU_AFFINITY to set cpu affinity; turn it off by default (#2217)
This commit is contained in:
@@ -72,7 +72,7 @@ def is_flashinfer_available():
|
||||
Check whether flashinfer is available.
|
||||
As of Oct. 6, 2024, it is only available on NVIDIA GPUs.
|
||||
"""
|
||||
if os.environ.get("SGLANG_IS_FLASHINFER_AVAILABLE", "true") == "false":
|
||||
if not get_bool_env_var("SGLANG_IS_FLASHINFER_AVAILABLE", default="true"):
|
||||
return False
|
||||
return torch.cuda.is_available() and not is_hip()
|
||||
|
||||
@@ -626,7 +626,7 @@ def add_api_key_middleware(app, api_key: str):
|
||||
|
||||
|
||||
def prepare_model_and_tokenizer(model_path: str, tokenizer_path: str):
|
||||
if "SGLANG_USE_MODELSCOPE" in os.environ:
|
||||
if get_bool_env_var("SGLANG_USE_MODELSCOPE"):
|
||||
if not os.path.exists(model_path):
|
||||
from modelscope import snapshot_download
|
||||
|
||||
@@ -931,7 +931,7 @@ def get_nvgpu_memory_capacity():
|
||||
|
||||
def crash_on_warnings():
|
||||
# Crash on warning if we are running CI tests
|
||||
return os.getenv("SGLANG_IS_IN_CI", "false").lower() == "true"
|
||||
return get_bool_env_var("SGLANG_IS_IN_CI")
|
||||
|
||||
|
||||
def get_device_name(device_id: int = 0) -> str:
|
||||
@@ -990,7 +990,7 @@ def direct_register_custom_op(
|
||||
my_lib._register_fake(op_name, fake_impl)
|
||||
|
||||
|
||||
def gpu_proc_affinity(
|
||||
def set_gpu_proc_affinity(
|
||||
tp_size: int,
|
||||
nnodes: int,
|
||||
gpu_id: int,
|
||||
@@ -1022,3 +1022,8 @@ def gpu_proc_affinity(
|
||||
# set cpu_affinity to current process
|
||||
p.cpu_affinity(bind_cpu_ids)
|
||||
logger.info(f"Process {pid} gpu_id {gpu_id} is running on CPUs: {p.cpu_affinity()}")
|
||||
|
||||
|
||||
def get_bool_env_var(name: str, default: str = "false") -> bool:
    """Read the environment variable ``name`` and interpret it as a boolean.

    Args:
        name: Name of the environment variable to look up.
        default: String value to use when the variable is unset
            (kept as a string so callers can pass e.g. ``default="true"``).

    Returns:
        True iff the value (or the default) is "true" or "1",
        case-insensitively; any other value yields False.
    """
    value = os.getenv(name, default)
    return value.lower() in ("true", "1")
|
||||
|
||||
Reference in New Issue
Block a user