Fix CI (#9012)

2025-08-09 13:33:42 -07:00
parent 41d71ca488
commit 9a44b643c6
9 changed files with 24 additions and 20 deletions
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -67,6 +67,7 @@ from sglang.srt.utils import (
    MultiprocessingSerializer,
    assert_pkg_version,
    configure_logger,
+    get_bool_env_var,
    get_zmq_socket,
    is_cuda,
    kill_process_tree,
@@ -627,7 +628,6 @@ def _set_envs_and_config(server_args: ServerArgs):
    os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem))
    if not server_args.enable_symm_mem:
        os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
-    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
    os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
    os.environ["CUDA_MODULE_LOADING"] = "AUTO"

@@ -647,7 +647,7 @@ def _set_envs_and_config(server_args: ServerArgs):
            "reinstall the latest version by following the instructions "
            "at https://docs.flashinfer.ai/installation.html.",
        )
-    if _is_cuda:
+    if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
        assert_pkg_version(
            "sgl-kernel",
            "0.3.3",
--- a/python/sglang/srt/entrypoints/openai/tool_server.py
+++ b/python/sglang/srt/entrypoints/openai/tool_server.py
@@ -5,16 +5,17 @@ from abc import ABC, abstractmethod
 from contextlib import AbstractAsyncContextManager, asynccontextmanager
 from typing import Any

-logger = logging.getLogger(__name__)
 try:
    from mcp import ClientSession
    from mcp.client.sse import sse_client
    from mcp.types import ListToolsResult
-except ImportError:
-    logger.warning("Ignoring mcp import error")
+except ImportError as e:
+    ClientSession = sse_client = ListToolsResult = e

 from openai_harmony import ToolDescription, ToolNamespaceConfig

+logger = logging.getLogger(__name__)
+

 async def list_server_and_tools(server_url: str):

--- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -147,6 +147,7 @@ class FusedMoE(torch.nn.Module):

        self.layer_id = layer_id
        self.top_k = top_k
+        self.hidden_size = hidden_size
        self.num_experts = num_experts
        self.num_fused_shared_experts = num_fused_shared_experts
        self.expert_map_cpu = None
--- a/python/sglang/srt/layers/quantization/init.py
+++ b/python/sglang/srt/layers/quantization/init.py
@@ -26,8 +26,9 @@ try:
    from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig

    VLLM_AVAILABLE = True
-except ImportError:
+except ImportError as e:
    VLLM_AVAILABLE = False
+    VLLM_IMPORT_ERROR = e

    # Define empty classes as placeholders when vllm is not available
    class DummyConfig:
@@ -137,7 +138,8 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
    if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
        raise ValueError(
            f"{quantization} quantization requires some operators from vllm. "
-            "Please install vllm by `pip install vllm==0.9.0.1`"
+            f"Please install vllm by `pip install vllm==0.9.0.1`\n"
+            f"Import error: {VLLM_IMPORT_ERROR}"
        )

    return QUANTIZATION_METHODS[quantization]
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -1,9 +1,8 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py
 from __future__ import annotations

-import importlib.util
 import logging
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional

 import torch
 from torch.nn.parameter import Parameter
@@ -42,11 +41,7 @@ if is_cuda():

 try:
    from flashinfer import mm_fp4 as fp4_gemm
-    from flashinfer import (
-        reorder_rows_for_gated_act_gemm,
-        shuffle_matrix_a,
-        shuffle_matrix_sf_a,
-    )
+    from flashinfer import reorder_rows_for_gated_act_gemm, shuffle_matrix_sf_a

    enable_flashinfer_fp4_gemm = True
 except ImportError:
--- a/python/sglang/srt/managers/multimodal_processor.py
+++ b/python/sglang/srt/managers/multimodal_processor.py
@@ -20,7 +20,7 @@ def import_processors():
            try:
                module = importlib.import_module(name)
            except Exception as e:
-                logger.warning(f"Ignore import error when loading {name}: " f"{e}")
+                logger.warning(f"Ignore import error when loading {name}: {e}")
                continue
            all_members = inspect.getmembers(module, inspect.isclass)
            classes = [
--- a/python/sglang/srt/models/registry.py
+++ b/python/sglang/srt/models/registry.py
@@ -83,7 +83,7 @@ def import_model_classes():
            try:
                module = importlib.import_module(name)
            except Exception as e:
-                logger.warning(f"Ignore import error when loading {name}. " f"{e}")
+                logger.warning(f"Ignore import error when loading {name}: {e}")
                continue
            if hasattr(module, "EntryClass"):
                entry = module.EntryClass