Tiny support setting numa nodes for different ranks (#10006)
This commit is contained in:
@@ -158,6 +158,7 @@ from sglang.srt.utils import (
|
||||
get_zmq_socket,
|
||||
is_cpu,
|
||||
kill_itself_when_parent_died,
|
||||
numa_bind_to_node,
|
||||
point_to_point_pyobj,
|
||||
pyspy_dump_schedulers,
|
||||
require_mlp_sync,
|
||||
@@ -2519,6 +2520,9 @@ def run_scheduler_process(
|
||||
pipe_writer,
|
||||
balance_meta: Optional[DPBalanceMeta] = None,
|
||||
):
|
||||
if (numa_node := server_args.numa_node) is not None:
|
||||
numa_bind_to_node(numa_node[gpu_id])
|
||||
|
||||
# Generate the prefix
|
||||
prefix = ""
|
||||
if dp_rank is not None:
|
||||
|
||||
@@ -351,6 +351,7 @@ class ServerArgs:
|
||||
disable_fast_image_processor: bool = False
|
||||
enable_return_hidden_states: bool = False
|
||||
scheduler_recv_interval: int = 1
|
||||
numa_node: Optional[List[int]] = None
|
||||
|
||||
# Debug tensor dumps
|
||||
debug_tensor_dump_output_folder: Optional[str] = None
|
||||
@@ -1991,6 +1992,12 @@ class ServerArgs:
|
||||
default=ServerArgs.scheduler_recv_interval,
|
||||
help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--numa-node",
|
||||
type=int,
|
||||
nargs="+",
|
||||
help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
|
||||
)
|
||||
|
||||
# Debug tensor dumps
|
||||
parser.add_argument(
|
||||
|
||||
@@ -3027,3 +3027,12 @@ def check_cuda_result(raw_output):
|
||||
raise Exception(f"CUDA error: {err}")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def numa_bind_to_node(node: int):
    """Bind the current process to a single NUMA node via libnuma.

    Restricts the process to run on the CPUs of *node* and switches the
    memory allocation policy to local allocation, so memory is preferred
    from the node the process is running on.

    Args:
        node: Index of the NUMA node to bind this process to.

    Raises:
        OSError: If ``libnuma.so`` cannot be loaded (propagated from
            ``ctypes.CDLL``).
        SystemError: If NUMA is not available on this system, or if
            binding to the requested node fails.
    """
    libnuma = ctypes.CDLL("libnuma.so")
    if libnuma.numa_available() < 0:
        raise SystemError("numa not available on this system")

    # numa_run_on_node returns 0 on success and -1 on failure (e.g. an
    # out-of-range node id). Surface the failure instead of silently
    # continuing unbound with a local-alloc policy.
    if libnuma.numa_run_on_node(ctypes.c_int(node)) != 0:
        raise SystemError(f"failed to bind process to numa node {node}")
    libnuma.numa_set_localalloc()
|
||||
|
||||
Reference in New Issue
Block a user