From 9a44b643c67d10bcb4d8129e2153647a26558ae5 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sat, 9 Aug 2025 13:33:42 -0700 Subject: [PATCH] Fix CI (#9012) --- .github/workflows/vllm-dependency-test.yml | 12 +++++++++--- python/sglang/srt/entrypoints/engine.py | 4 ++-- python/sglang/srt/entrypoints/openai/tool_server.py | 7 ++++--- .../sglang/srt/layers/moe/fused_moe_triton/layer.py | 1 + python/sglang/srt/layers/quantization/__init__.py | 6 ++++-- .../sglang/srt/layers/quantization/modelopt_quant.py | 9 ++------- python/sglang/srt/managers/multimodal_processor.py | 2 +- python/sglang/srt/models/registry.py | 2 +- test/srt/test_utils_update_weights.py | 1 - 9 files changed, 24 insertions(+), 20 deletions(-) diff --git a/.github/workflows/vllm-dependency-test.yml b/.github/workflows/vllm-dependency-test.yml index 785b42efc..7dc6a8ba6 100644 --- a/.github/workflows/vllm-dependency-test.yml +++ b/.github/workflows/vllm-dependency-test.yml @@ -30,13 +30,19 @@ jobs: - name: Install dependencies run: | bash scripts/ci_install_dependency.sh - pip install "vllm==0.9.0" - pip install "bitsandbytes>=0.44.0" - pip install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126 + pip install "vllm==0.10.0" pip install "openai==1.99.1" + pip install "bitsandbytes>=0.44.0" + + # NOTE: The latest sgl-kernel depends on torch 2.8.0 but the latest vllm depends on torch 2.7.0 + # so they are not compatible. Here we install the old sgl-kernel to make the test pass. + # TODO: remove this once vllm supports torch 2.8.0. + pip install "sgl-kernel==0.2.9" - name: Run vLLM dependency tests timeout-minutes: 60 run: | + export SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK=1 + cd test/srt python3 run_suite.py --suite vllm_dependency_test --timeout-per-file 3600 diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py index b24032988..bde60ddfc 100644 --- a/python/sglang/srt/entrypoints/engine.py +++ b/python/sglang/srt/entrypoints/engine.py @@ -67,6 +67,7 @@ from sglang.srt.utils import ( MultiprocessingSerializer, assert_pkg_version, configure_logger, + get_bool_env_var, get_zmq_socket, is_cuda, kill_process_tree, @@ -627,7 +628,6 @@ def _set_envs_and_config(server_args: ServerArgs): os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem)) if not server_args.enable_symm_mem: os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls)) - os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4" os.environ["CUDA_MODULE_LOADING"] = "AUTO" @@ -647,7 +647,7 @@ def _set_envs_and_config(server_args: ServerArgs): "reinstall the latest version by following the instructions " "at https://docs.flashinfer.ai/installation.html.", ) - if _is_cuda: + if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"): assert_pkg_version( "sgl-kernel", "0.3.3", diff --git a/python/sglang/srt/entrypoints/openai/tool_server.py b/python/sglang/srt/entrypoints/openai/tool_server.py index fd66eb42b..269d9e99e 100644 --- a/python/sglang/srt/entrypoints/openai/tool_server.py +++ b/python/sglang/srt/entrypoints/openai/tool_server.py @@ -5,16 +5,17 @@ from abc import ABC, abstractmethod from contextlib import AbstractAsyncContextManager, asynccontextmanager from typing import Any -logger = logging.getLogger(__name__) try: from mcp import ClientSession from mcp.client.sse import sse_client from mcp.types import ListToolsResult -except ImportError: - logger.warning("Ignoring mcp import error") +except ImportError as e: + ClientSession = sse_client = ListToolsResult = e from openai_harmony import ToolDescription, ToolNamespaceConfig +logger = logging.getLogger(__name__) + async def list_server_and_tools(server_url: str): diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py index 9bf97b690..8bde5ac8d 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py @@ -147,6 +147,7 @@ class FusedMoE(torch.nn.Module): self.layer_id = layer_id self.top_k = top_k + self.hidden_size = hidden_size self.num_experts = num_experts self.num_fused_shared_experts = num_fused_shared_experts self.expert_map_cpu = None diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index 7be1572da..e94b3f18a 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -26,8 +26,9 @@ try: from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig VLLM_AVAILABLE = True -except ImportError: +except ImportError as e: VLLM_AVAILABLE = False + VLLM_IMPORT_ERROR = e # Define empty classes as placeholders when vllm is not available class DummyConfig: @@ -137,7 +138,8 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE: raise ValueError( f"{quantization} quantization requires some operators from vllm. " - "Please install vllm by `pip install vllm==0.9.0.1`" + f"Please install vllm by `pip install vllm==0.9.0.1`\n" + f"Import error: {VLLM_IMPORT_ERROR}" ) return QUANTIZATION_METHODS[quantization] diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py index 4e2b3a53e..dc28ee545 100755 --- a/python/sglang/srt/layers/quantization/modelopt_quant.py +++ b/python/sglang/srt/layers/quantization/modelopt_quant.py @@ -1,9 +1,8 @@ # Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py from __future__ import annotations -import importlib.util import logging -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional import torch from torch.nn.parameter import Parameter @@ -42,11 +41,7 @@ if is_cuda(): try: from flashinfer import mm_fp4 as fp4_gemm - from flashinfer import ( - reorder_rows_for_gated_act_gemm, - shuffle_matrix_a, - shuffle_matrix_sf_a, - ) + from flashinfer import reorder_rows_for_gated_act_gemm, shuffle_matrix_sf_a enable_flashinfer_fp4_gemm = True except ImportError: diff --git a/python/sglang/srt/managers/multimodal_processor.py b/python/sglang/srt/managers/multimodal_processor.py index 51b6f3d92..bc060a5b3 100644 --- a/python/sglang/srt/managers/multimodal_processor.py +++ b/python/sglang/srt/managers/multimodal_processor.py @@ -20,7 +20,7 @@ def import_processors(): try: module = importlib.import_module(name) except Exception as e: - logger.warning(f"Ignore import error when loading {name}: " f"{e}") + logger.warning(f"Ignore import error when loading {name}: {e}") continue all_members = inspect.getmembers(module, inspect.isclass) classes = [ diff --git a/python/sglang/srt/models/registry.py b/python/sglang/srt/models/registry.py index f81d3c76e..76e042a95 100644 --- a/python/sglang/srt/models/registry.py +++ b/python/sglang/srt/models/registry.py @@ -83,7 +83,7 @@ def import_model_classes(): try: module = importlib.import_module(name) except Exception as e: - logger.warning(f"Ignore import error when loading {name}. " f"{e}") + logger.warning(f"Ignore import error when loading {name}: {e}") continue if hasattr(module, "EntryClass"): entry = module.EntryClass diff --git a/test/srt/test_utils_update_weights.py b/test/srt/test_utils_update_weights.py index 03262f10a..8c138f0ab 100644 --- a/test/srt/test_utils_update_weights.py +++ b/test/srt/test_utils_update_weights.py @@ -83,7 +83,6 @@ class TestUtilsUpdateWeights(unittest.TestCase): # Set up environment variables os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" os.environ["NCCL_CUMEM_ENABLE"] = "0" - os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4" os.environ["CUDA_MODULE_LOADING"] = "AUTO"