chore: upgrade sgl-kernel 0.1.1 (#5933)
This commit is contained in:
2
.github/workflows/vllm-dependency-test.yml
vendored
2
.github/workflows/vllm-dependency-test.yml
vendored
@@ -30,7 +30,7 @@ jobs:
|
|||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
bash scripts/ci_install_dependency.sh
|
bash scripts/ci_install_dependency.sh
|
||||||
pip install "vllm>=0.6.4.post1,<=0.7.2"
|
pip install "vllm==0.8.4"
|
||||||
pip install "bitsandbytes>=0.44.0"
|
pip install "bitsandbytes>=0.44.0"
|
||||||
|
|
||||||
- name: Run VLLM dependency tests
|
- name: Run VLLM dependency tests
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ runtime_common = [
|
|||||||
|
|
||||||
srt = [
|
srt = [
|
||||||
"sglang[runtime_common]",
|
"sglang[runtime_common]",
|
||||||
"sgl-kernel==0.1.0",
|
"sgl-kernel==0.1.1",
|
||||||
"flashinfer_python==0.2.5",
|
"flashinfer_python==0.2.5",
|
||||||
"torch==2.6.0",
|
"torch==2.6.0",
|
||||||
"torchvision==0.21.0",
|
"torchvision==0.21.0",
|
||||||
|
|||||||
@@ -461,7 +461,7 @@ def _set_envs_and_config(server_args: ServerArgs):
|
|||||||
if _is_cuda:
|
if _is_cuda:
|
||||||
assert_pkg_version(
|
assert_pkg_version(
|
||||||
"sgl-kernel",
|
"sgl-kernel",
|
||||||
"0.1.0",
|
"0.1.1",
|
||||||
"Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
|
"Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -109,7 +109,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
|
|||||||
if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
|
if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"{quantization} quantization requires some operators from vllm. "
|
f"{quantization} quantization requires some operators from vllm. "
|
||||||
"Pleaes install vllm by `pip install vllm==0.7.2`"
|
"Pleaes install vllm by `pip install vllm==0.8.4`"
|
||||||
)
|
)
|
||||||
|
|
||||||
return QUANTIZATION_METHODS[quantization]
|
return QUANTIZATION_METHODS[quantization]
|
||||||
@@ -310,7 +310,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
|
|||||||
if correction_bias is not None:
|
if correction_bias is not None:
|
||||||
if not has_correction_bias:
|
if not has_correction_bias:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Please increase the version of your vllm. Try `pip install vllm==0.7.2`"
|
"Please increase the version of your vllm. Try `pip install vllm==0.8.4`"
|
||||||
)
|
)
|
||||||
kwargs["e_score_correction_bias"] = correction_bias
|
kwargs["e_score_correction_bias"] = correction_bias
|
||||||
return original_apply(**kwargs)
|
return original_apply(**kwargs)
|
||||||
|
|||||||
@@ -79,6 +79,7 @@ from sglang.srt.utils import (
|
|||||||
get_available_gpu_memory,
|
get_available_gpu_memory,
|
||||||
get_bool_env_var,
|
get_bool_env_var,
|
||||||
init_custom_process_group,
|
init_custom_process_group,
|
||||||
|
is_ampere_with_cuda_12_3,
|
||||||
is_cuda,
|
is_cuda,
|
||||||
is_fa3_default_architecture,
|
is_fa3_default_architecture,
|
||||||
is_flashinfer_available,
|
is_flashinfer_available,
|
||||||
@@ -246,7 +247,7 @@ class ModelRunner:
|
|||||||
if not self.use_mla_backend:
|
if not self.use_mla_backend:
|
||||||
# MHA architecture
|
# MHA architecture
|
||||||
if (
|
if (
|
||||||
is_hopper_with_cuda_12_3()
|
(is_ampere_with_cuda_12_3() or is_hopper_with_cuda_12_3())
|
||||||
and is_no_spec_infer_or_topk_one(server_args)
|
and is_no_spec_infer_or_topk_one(server_args)
|
||||||
and is_fa3_default_architecture(self.model_config.hf_config)
|
and is_fa3_default_architecture(self.model_config.hf_config)
|
||||||
):
|
):
|
||||||
@@ -927,8 +928,10 @@ class ModelRunner:
|
|||||||
|
|
||||||
self.attn_backend = FlashMLABackend(self)
|
self.attn_backend = FlashMLABackend(self)
|
||||||
elif self.server_args.attention_backend == "fa3":
|
elif self.server_args.attention_backend == "fa3":
|
||||||
assert torch.cuda.get_device_capability()[0] >= 9, (
|
assert (
|
||||||
"FlashAttention v3 Backend requires SM>=90. "
|
torch.cuda.get_device_capability()[0] == 8 and not self.use_mla_backend
|
||||||
|
) or torch.cuda.get_device_capability()[0] == 9, (
|
||||||
|
"FlashAttention v3 Backend requires SM>=80 and SM<=90. "
|
||||||
"Please use `--attention-backend flashinfer`."
|
"Please use `--attention-backend flashinfer`."
|
||||||
)
|
)
|
||||||
from sglang.srt.layers.attention.flashattention_backend import (
|
from sglang.srt.layers.attention.flashattention_backend import (
|
||||||
|
|||||||
@@ -1905,13 +1905,16 @@ def fast_topk(values, topk, dim):
|
|||||||
return torch.topk(values, topk, dim=dim)
|
return torch.topk(values, topk, dim=dim)
|
||||||
|
|
||||||
|
|
||||||
def is_hopper_with_cuda_12_3():
|
def _check(cc_major):
|
||||||
if not is_cuda():
|
if not is_cuda():
|
||||||
return False
|
return False
|
||||||
is_hopper = torch.cuda.get_device_capability()[0] == 9
|
return torch.cuda.get_device_capability()[0] == cc_major and tuple(
|
||||||
cuda_version = torch.version.cuda.split(".")
|
map(int, torch.version.cuda.split(".")[:2])
|
||||||
is_cuda_compatible = int(cuda_version[0]) == 12 and int(cuda_version[1]) >= 3
|
) >= (12, 3)
|
||||||
return is_hopper and is_cuda_compatible
|
|
||||||
|
|
||||||
|
is_ampere_with_cuda_12_3 = lambda: _check(8)
|
||||||
|
is_hopper_with_cuda_12_3 = lambda: _check(9)
|
||||||
|
|
||||||
|
|
||||||
def get_free_port():
|
def get_free_port():
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ rm -rf /usr/local/lib/python3.10/dist-packages/sgl_kernel*
|
|||||||
pip install --upgrade pip
|
pip install --upgrade pip
|
||||||
|
|
||||||
# Install sgl-kernel
|
# Install sgl-kernel
|
||||||
pip install sgl-kernel==0.1.0 --no-cache-dir
|
pip install sgl-kernel==0.1.1 --no-cache-dir
|
||||||
|
|
||||||
# Install the main package
|
# Install the main package
|
||||||
pip install -e "python[all]"
|
pip install -e "python[all]"
|
||||||
|
|||||||
Reference in New Issue
Block a user