diff --git a/.github/workflows/vllm-dependency-test.yml b/.github/workflows/vllm-dependency-test.yml index 57ad68d52..e4590a9ee 100644 --- a/.github/workflows/vllm-dependency-test.yml +++ b/.github/workflows/vllm-dependency-test.yml @@ -30,7 +30,7 @@ jobs: - name: Install dependencies run: | bash scripts/ci_install_dependency.sh - pip install "vllm>=0.6.4.post1,<=0.7.2" + pip install "vllm==0.8.4" pip install "bitsandbytes>=0.44.0" - name: Run VLLM dependency tests diff --git a/python/pyproject.toml b/python/pyproject.toml index afec3d10b..4477a424a 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -47,7 +47,7 @@ runtime_common = [ srt = [ "sglang[runtime_common]", - "sgl-kernel==0.1.0", + "sgl-kernel==0.1.1", "flashinfer_python==0.2.5", "torch==2.6.0", "torchvision==0.21.0", diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py index f26437db9..3c1d308e8 100644 --- a/python/sglang/srt/entrypoints/engine.py +++ b/python/sglang/srt/entrypoints/engine.py @@ -461,7 +461,7 @@ def _set_envs_and_config(server_args: ServerArgs): if _is_cuda: assert_pkg_version( "sgl-kernel", - "0.1.0", + "0.1.1", "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`", ) diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index 6b8719bfa..81ee11a09 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -109,7 +109,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE: raise ValueError( f"{quantization} quantization requires some operators from vllm. 
" - "Pleaes install vllm by `pip install vllm==0.7.2`" + "Please install vllm by `pip install vllm==0.8.4`" ) return QUANTIZATION_METHODS[quantization] @@ -310,7 +310,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"): if correction_bias is not None: if not has_correction_bias: raise ValueError( - "Please increase the version of your vllm. Try `pip install vllm==0.7.2`" + "Please increase the version of your vllm. Try `pip install vllm==0.8.4`" ) kwargs["e_score_correction_bias"] = correction_bias return original_apply(**kwargs) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 231428b83..8b0ad93a5 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -79,6 +79,7 @@ from sglang.srt.utils import ( get_available_gpu_memory, get_bool_env_var, init_custom_process_group, + is_ampere_with_cuda_12_3, is_cuda, is_fa3_default_architecture, is_flashinfer_available, @@ -246,7 +247,7 @@ class ModelRunner: if not self.use_mla_backend: # MHA architecture if ( - is_hopper_with_cuda_12_3() + (is_ampere_with_cuda_12_3() or is_hopper_with_cuda_12_3()) and is_no_spec_infer_or_topk_one(server_args) and is_fa3_default_architecture(self.model_config.hf_config) ): @@ -927,8 +928,10 @@ class ModelRunner: self.attn_backend = FlashMLABackend(self) elif self.server_args.attention_backend == "fa3": - assert torch.cuda.get_device_capability()[0] >= 9, ( - "FlashAttention v3 Backend requires SM>=90. " + assert ( + torch.cuda.get_device_capability()[0] == 8 and not self.use_mla_backend ) or torch.cuda.get_device_capability()[0] == 9, ( + "FlashAttention v3 Backend requires SM>=80 and SM<=90. " "Please use `--attention-backend flashinfer`." 
) from sglang.srt.layers.attention.flashattention_backend import ( diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 3a7f17db7..ef18002e0 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -1905,13 +1905,16 @@ def fast_topk(values, topk, dim): return torch.topk(values, topk, dim=dim) -def is_hopper_with_cuda_12_3(): +def _check(cc_major): if not is_cuda(): return False - is_hopper = torch.cuda.get_device_capability()[0] == 9 - cuda_version = torch.version.cuda.split(".") - is_cuda_compatible = int(cuda_version[0]) == 12 and int(cuda_version[1]) >= 3 - return is_hopper and is_cuda_compatible + return torch.cuda.get_device_capability()[0] == cc_major and tuple( + map(int, torch.version.cuda.split(".")[:2]) + ) >= (12, 3) + + +is_ampere_with_cuda_12_3 = lambda: _check(8) +is_hopper_with_cuda_12_3 = lambda: _check(9) def get_free_port(): diff --git a/scripts/ci_install_dependency.sh b/scripts/ci_install_dependency.sh index e9aa3c332..a02d6e2d5 100755 --- a/scripts/ci_install_dependency.sh +++ b/scripts/ci_install_dependency.sh @@ -16,7 +16,7 @@ rm -rf /usr/local/lib/python3.10/dist-packages/sgl_kernel* pip install --upgrade pip # Install sgl-kernel -pip install sgl-kernel==0.1.0 --no-cache-dir +pip install sgl-kernel==0.1.1 --no-cache-dir # Install the main package pip install -e "python[all]"