[CPU] support the case where num_attention_heads or intermediate_size is not divisible by the TP size (#6771)

2025-07-04 00:51:38 +08:00
parent 9fcc9a80e7
commit 1dce6c480f
11 changed files with 399 additions and 40 deletions
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -149,6 +149,7 @@ from sglang.srt.utils import (
    get_available_gpu_memory,
    get_bool_env_var,
    get_zmq_socket,
+    is_cpu,
    kill_itself_when_parent_died,
    point_to_point_pyobj,
    pyspy_dump_schedulers,
@@ -167,6 +168,8 @@ TEST_RETRACT = get_bool_env_var("SGLANG_TEST_RETRACT")
 RECORD_STEP_TIME = get_bool_env_var("SGLANG_RECORD_STEP_TIME")
 GRAMMAR_TIMEOUT = float(os.environ.get("SGLANG_GRAMMAR_TIMEOUT", 300))

+_is_cpu = is_cpu()
+

@dataclass
 class GenerationBatchResult:
@@ -2115,11 +2118,14 @@ class Scheduler(
            "kvcache": round(
                self.token_to_kv_pool_allocator.get_kvcache().mem_usage, 2
            ),
-            "cuda_graph": round(
-                self.tp_worker.worker.model_runner.cuda_graph_mem_usage, 2
-            ),
            "token_capacity": int(self.max_total_num_tokens),
        }
+
+        if not _is_cpu:
+            ret["memory_usage"]["cuda_graph"] = round(
+                self.tp_worker.worker.model_runner.cuda_graph_mem_usage, 2
+            )
+
        if not self.spec_algorithm.is_none() and self.cum_spec_accept_count > 0:
            ret["avg_spec_accept_length"] = (
                self.cum_spec_accept_length / self.cum_spec_accept_count