Fix CI (#9012)
This commit is contained in:
12
.github/workflows/vllm-dependency-test.yml
vendored
12
.github/workflows/vllm-dependency-test.yml
vendored
@@ -30,13 +30,19 @@ jobs:
|
|||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
bash scripts/ci_install_dependency.sh
|
bash scripts/ci_install_dependency.sh
|
||||||
pip install "vllm==0.9.0"
|
pip install "vllm==0.10.0"
|
||||||
pip install "bitsandbytes>=0.44.0"
|
|
||||||
pip install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
|
|
||||||
pip install "openai==1.99.1"
|
pip install "openai==1.99.1"
|
||||||
|
pip install "bitsandbytes>=0.44.0"
|
||||||
|
|
||||||
|
# NOTE: The latest sgl-kernel depends on torch 2.8.0 but the latest vllm depends on torch 2.7.0
|
||||||
|
# so they are not compatible. Here we install the old sgl-kernel to make the test pass.
|
||||||
|
# TODO: remove this once vllm supports torch 2.8.0.
|
||||||
|
pip install "sgl-kernel==0.2.9"
|
||||||
|
|
||||||
- name: Run vLLM dependency tests
|
- name: Run vLLM dependency tests
|
||||||
timeout-minutes: 60
|
timeout-minutes: 60
|
||||||
run: |
|
run: |
|
||||||
|
export SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK=1
|
||||||
|
|
||||||
cd test/srt
|
cd test/srt
|
||||||
python3 run_suite.py --suite vllm_dependency_test --timeout-per-file 3600
|
python3 run_suite.py --suite vllm_dependency_test --timeout-per-file 3600
|
||||||
|
|||||||
@@ -67,6 +67,7 @@ from sglang.srt.utils import (
|
|||||||
MultiprocessingSerializer,
|
MultiprocessingSerializer,
|
||||||
assert_pkg_version,
|
assert_pkg_version,
|
||||||
configure_logger,
|
configure_logger,
|
||||||
|
get_bool_env_var,
|
||||||
get_zmq_socket,
|
get_zmq_socket,
|
||||||
is_cuda,
|
is_cuda,
|
||||||
kill_process_tree,
|
kill_process_tree,
|
||||||
@@ -627,7 +628,6 @@ def _set_envs_and_config(server_args: ServerArgs):
|
|||||||
os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem))
|
os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem))
|
||||||
if not server_args.enable_symm_mem:
|
if not server_args.enable_symm_mem:
|
||||||
os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
|
os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
|
||||||
os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
|
|
||||||
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
|
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
|
||||||
os.environ["CUDA_MODULE_LOADING"] = "AUTO"
|
os.environ["CUDA_MODULE_LOADING"] = "AUTO"
|
||||||
|
|
||||||
@@ -647,7 +647,7 @@ def _set_envs_and_config(server_args: ServerArgs):
|
|||||||
"reinstall the latest version by following the instructions "
|
"reinstall the latest version by following the instructions "
|
||||||
"at https://docs.flashinfer.ai/installation.html.",
|
"at https://docs.flashinfer.ai/installation.html.",
|
||||||
)
|
)
|
||||||
if _is_cuda:
|
if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
|
||||||
assert_pkg_version(
|
assert_pkg_version(
|
||||||
"sgl-kernel",
|
"sgl-kernel",
|
||||||
"0.3.3",
|
"0.3.3",
|
||||||
|
|||||||
@@ -5,16 +5,17 @@ from abc import ABC, abstractmethod
|
|||||||
from contextlib import AbstractAsyncContextManager, asynccontextmanager
|
from contextlib import AbstractAsyncContextManager, asynccontextmanager
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
try:
|
try:
|
||||||
from mcp import ClientSession
|
from mcp import ClientSession
|
||||||
from mcp.client.sse import sse_client
|
from mcp.client.sse import sse_client
|
||||||
from mcp.types import ListToolsResult
|
from mcp.types import ListToolsResult
|
||||||
except ImportError:
|
except ImportError as e:
|
||||||
logger.warning("Ignoring mcp import error")
|
ClientSession = sse_client = ListToolsResult = e
|
||||||
|
|
||||||
from openai_harmony import ToolDescription, ToolNamespaceConfig
|
from openai_harmony import ToolDescription, ToolNamespaceConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
async def list_server_and_tools(server_url: str):
|
async def list_server_and_tools(server_url: str):
|
||||||
|
|
||||||
|
|||||||
@@ -147,6 +147,7 @@ class FusedMoE(torch.nn.Module):
|
|||||||
|
|
||||||
self.layer_id = layer_id
|
self.layer_id = layer_id
|
||||||
self.top_k = top_k
|
self.top_k = top_k
|
||||||
|
self.hidden_size = hidden_size
|
||||||
self.num_experts = num_experts
|
self.num_experts = num_experts
|
||||||
self.num_fused_shared_experts = num_fused_shared_experts
|
self.num_fused_shared_experts = num_fused_shared_experts
|
||||||
self.expert_map_cpu = None
|
self.expert_map_cpu = None
|
||||||
|
|||||||
@@ -26,8 +26,9 @@ try:
|
|||||||
from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig
|
from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig
|
||||||
|
|
||||||
VLLM_AVAILABLE = True
|
VLLM_AVAILABLE = True
|
||||||
except ImportError:
|
except ImportError as e:
|
||||||
VLLM_AVAILABLE = False
|
VLLM_AVAILABLE = False
|
||||||
|
VLLM_IMPORT_ERROR = e
|
||||||
|
|
||||||
# Define empty classes as placeholders when vllm is not available
|
# Define empty classes as placeholders when vllm is not available
|
||||||
class DummyConfig:
|
class DummyConfig:
|
||||||
@@ -137,7 +138,8 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
|
|||||||
if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
|
if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"{quantization} quantization requires some operators from vllm. "
|
f"{quantization} quantization requires some operators from vllm. "
|
||||||
"Please install vllm by `pip install vllm==0.9.0.1`"
|
f"Please install vllm by `pip install vllm==0.9.0.1`\n"
|
||||||
|
f"Import error: {VLLM_IMPORT_ERROR}"
|
||||||
)
|
)
|
||||||
|
|
||||||
return QUANTIZATION_METHODS[quantization]
|
return QUANTIZATION_METHODS[quantization]
|
||||||
|
|||||||
@@ -1,9 +1,8 @@
|
|||||||
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py
|
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import importlib.util
|
|
||||||
import logging
|
import logging
|
||||||
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch.nn.parameter import Parameter
|
from torch.nn.parameter import Parameter
|
||||||
@@ -42,11 +41,7 @@ if is_cuda():
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
from flashinfer import mm_fp4 as fp4_gemm
|
from flashinfer import mm_fp4 as fp4_gemm
|
||||||
from flashinfer import (
|
from flashinfer import reorder_rows_for_gated_act_gemm, shuffle_matrix_sf_a
|
||||||
reorder_rows_for_gated_act_gemm,
|
|
||||||
shuffle_matrix_a,
|
|
||||||
shuffle_matrix_sf_a,
|
|
||||||
)
|
|
||||||
|
|
||||||
enable_flashinfer_fp4_gemm = True
|
enable_flashinfer_fp4_gemm = True
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ def import_processors():
|
|||||||
try:
|
try:
|
||||||
module = importlib.import_module(name)
|
module = importlib.import_module(name)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Ignore import error when loading {name}: " f"{e}")
|
logger.warning(f"Ignore import error when loading {name}: {e}")
|
||||||
continue
|
continue
|
||||||
all_members = inspect.getmembers(module, inspect.isclass)
|
all_members = inspect.getmembers(module, inspect.isclass)
|
||||||
classes = [
|
classes = [
|
||||||
|
|||||||
@@ -83,7 +83,7 @@ def import_model_classes():
|
|||||||
try:
|
try:
|
||||||
module = importlib.import_module(name)
|
module = importlib.import_module(name)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Ignore import error when loading {name}. " f"{e}")
|
logger.warning(f"Ignore import error when loading {name}: {e}")
|
||||||
continue
|
continue
|
||||||
if hasattr(module, "EntryClass"):
|
if hasattr(module, "EntryClass"):
|
||||||
entry = module.EntryClass
|
entry = module.EntryClass
|
||||||
|
|||||||
@@ -83,7 +83,6 @@ class TestUtilsUpdateWeights(unittest.TestCase):
|
|||||||
# Set up environment variables
|
# Set up environment variables
|
||||||
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
||||||
os.environ["NCCL_CUMEM_ENABLE"] = "0"
|
os.environ["NCCL_CUMEM_ENABLE"] = "0"
|
||||||
os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
|
|
||||||
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
|
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
|
||||||
os.environ["CUDA_MODULE_LOADING"] = "AUTO"
|
os.environ["CUDA_MODULE_LOADING"] = "AUTO"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user