[CI] Fix ci tests (#5769)

2025-04-27 07:18:10 -07:00
parent 35ca04d2fa
commit 621e96bf9b
18 changed files with 126 additions and 295 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -201,7 +201,7 @@ class ServerArgs:
        # Expert parallelism
        if self.enable_ep_moe:
            self.ep_size = self.tp_size
-            logger.info(
+            logger.warning(
                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
            )

@@ -243,19 +243,19 @@ class ServerArgs:
                self.chunked_prefill_size = 2048
            else:
                self.chunked_prefill_size = 8192
-
        assert self.chunked_prefill_size % self.page_size == 0

        assert self.moe_dense_tp_size in {
            1,
            None,
-        }, f"moe_dense_tp_size only support 1 and None currently"
+        }, "moe_dense_tp_size only support 1 and None currently"

        if self.attention_backend == "flashmla":
            logger.warning(
                "FlashMLA only supports a page_size of 64, change page_size to 64."
            )
            self.page_size = 64
+
        # Set cuda graph max batch size
        if self.cuda_graph_max_bs is None:
            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
@@ -270,6 +270,7 @@ class ServerArgs:
            self.attention_backend = "torch_native"
            self.sampling_backend = "pytorch"

+        # Set kernel backends
        if self.sampling_backend is None:
            self.sampling_backend = (
                "flashinfer" if is_flashinfer_available() else "pytorch"
@@ -297,8 +298,8 @@ class ServerArgs:
                f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
            )

-        self.enable_sp_layernorm = False
        # DeepEP MoE
+        self.enable_sp_layernorm = False
        if self.enable_deepep_moe:
            if self.deepep_mode == "auto":
                assert (
@@ -308,7 +309,7 @@ class ServerArgs:
            self.enable_sp_layernorm = (
                self.dp_size < self.tp_size if self.enable_dp_attention else True
            )
-            logger.info(
+            logger.warning(
                f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
            )

@@ -317,14 +318,11 @@ class ServerArgs:
            # NEXTN shares the same implementation of EAGLE
            self.speculative_algorithm = "EAGLE"

-        if (
-            self.speculative_algorithm == "EAGLE"
-            or self.speculative_algorithm == "EAGLE3"
-        ):
+        if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
            if self.max_running_requests is None:
                self.max_running_requests = 48
            self.disable_overlap_schedule = True
-            logger.info(
+            logger.warning(
                "Overlap scheduler is disabled because of using "
                "eagle speculative decoding."
            )
@@ -343,7 +341,7 @@ class ServerArgs:

            if self.page_size > 1 and self.speculative_eagle_topk > 1:
                self.speculative_eagle_topk = 1
-                logger.info(
+                logger.warning(
                    "speculative_eagle_topk is adjusted to 1 when page_size > 1"
                )

@@ -351,7 +349,7 @@ class ServerArgs:
                self.speculative_eagle_topk == 1
                and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
            ):
-                logger.info(
+                logger.warning(
                    "speculative_num_draft_tokens is adjusted to speculative_num_steps + 1 when speculative_eagle_topk == 1"
                )
                self.speculative_num_draft_tokens = self.speculative_num_steps + 1
@@ -381,18 +379,6 @@ class ServerArgs:
            self.disable_radix_cache = True
            logger.warning("KV cache is forced as chunk cache for decode server")

-        if self.enable_memory_saver:
-            try:
-                import torch_memory_saver
-            except ImportError:
-                logger.warning(
-                    "enable_memory_saver is enabled, but "
-                    "torch-memory-saver is not installed. Please install it "
-                    "via `pip3 uninstall torch-memory-saver`. "
-                    "For normal operation, it will be disabled."
-                )
-                raise
-
        os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
            "1" if self.enable_torch_compile else "0"
        )