chore: upgrade flashinfer v0.2.6.post1 jit (#6958)
Co-authored-by: alcanderian <alcanderian@gmail.com>
Co-authored-by: Qiaolin Yu <qy254@cornell.edu>
Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
Co-authored-by: Mick <mickjagger19@icloud.com>
Co-authored-by: ispobock <ispobaoke@gmail.com>
@@ -49,10 +49,11 @@ runtime_common = [
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.1.6.post1",
-    "flashinfer_python==0.2.5",
-    "torch==2.6.0",
-    "torchvision==0.21.0",
+    "sgl-kernel==0.1.7",
+    "flashinfer_python==0.2.6.post1",
+    "torch==2.7.1",
+    "torchaudio==2.7.1",
+    "torchvision==0.22.1",
     "cuda-python",
     "outlines>=0.0.44,<=0.1.11",
     "einops",
@@ -61,12 +62,13 @@ srt = [
 blackwell = [
     "sglang[runtime_common]",
     "sgl-kernel",
-    "torch==2.7.0",
+    "torch==2.7.1",
+    "torchaudio==2.7.1",
     "torchvision==0.22.0",
     "cuda-python",
     "outlines>=0.0.44,<=0.1.11",
     "einops",
-    "flashinfer_python==0.2.5",
+    "flashinfer_python==0.2.6.post1",
 ]

 # HIP (Heterogeneous-computing Interface for Portability) for AMD
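For reference, the updated pins can be read straight from the project metadata. A minimal sketch, assuming the file sits at pyproject.toml in the repository root and Python 3.11+ is available (tomllib is stdlib there; older interpreters would need the third-party tomli package):

    import tomllib  # stdlib in Python 3.11+

    with open("pyproject.toml", "rb") as f:
        project = tomllib.load(f)

    # PEP 621 extras live under [project.optional-dependencies].
    extras = project["project"]["optional-dependencies"]
    for name in ("srt", "blackwell"):
        print(name)
        for requirement in extras[name]:
            print("   ", requirement)
    # Expected to list the new pins, e.g. sgl-kernel==0.1.7 and flashinfer_python==0.2.6.post1.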
@@ -571,7 +571,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.5",
+            "0.2.6.post1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -579,7 +579,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.1.6.post1",
+            "0.1.7",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )

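Both checks fail fast at server start-up when an older wheel is still installed. As a rough illustration of what such a guard can look like (a minimal sketch using importlib.metadata and the third-party packaging library; sglang's actual assert_pkg_version helper may differ in details):

    from importlib.metadata import PackageNotFoundError, version

    from packaging.version import Version  # third-party "packaging" package


    def check_pkg_version(pkg: str, required: str, hint: str) -> None:
        """Raise if `pkg` is missing or older than `required` (illustrative only)."""
        try:
            installed = version(pkg)
        except PackageNotFoundError:
            raise RuntimeError(f"{pkg} is not installed. {hint}")
        if Version(installed) < Version(required):
            raise RuntimeError(f"{pkg}=={installed} is too old (need >= {required}). {hint}")


    check_pkg_version("flashinfer_python", "0.2.6.post1", "See https://docs.flashinfer.ai/installation.html.")
    check_pkg_version("sgl-kernel", "0.1.7", "Run `pip install sgl-kernel --force-reinstall`.")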
@@ -0,0 +1,146 @@
{
    "1": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 64,
        "BLOCK_SIZE_K": 256,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 4
    },
    "2": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 64,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 16,
        "num_warps": 4,
        "num_stages": 5
    },
    "4": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 32,
        "BLOCK_SIZE_K": 256,
        "GROUP_SIZE_M": 16,
        "num_warps": 4,
        "num_stages": 3
    },
    "8": {
        "BLOCK_SIZE_M": 32,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 3
    },
    "16": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 64,
        "num_warps": 8,
        "num_stages": 3
    },
    "24": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 32,
        "num_warps": 8,
        "num_stages": 3
    },
    "32": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 3
    },
    "48": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 4
    },
    "64": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 4
    },
    "96": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 5
    },
    "128": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 5
    },
    "256": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 1,
        "num_warps": 8,
        "num_stages": 5
    },
    "512": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 64,
        "num_warps": 8,
        "num_stages": 4
    },
    "1024": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 64,
        "num_warps": 8,
        "num_stages": 4
    },
    "1536": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 32,
        "num_warps": 8,
        "num_stages": 4
    },
    "2048": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 64,
        "num_warps": 8,
        "num_stages": 3
    },
    "3072": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 32,
        "num_warps": 8,
        "num_stages": 4
    },
    "4096": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 16,
        "num_warps": 8,
        "num_stages": 4
    }
}
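This new file adds Triton tuning parameters for the fused-MoE kernel, keyed by the number of tokens in the batch. Loaders for such tables typically pick the entry whose key is closest to the actual token count. A minimal sketch of that lookup, assuming the JSON above is saved as moe_config.json (the path and the project's real loader differ):

    import json
    from typing import Dict


    def load_moe_config(path: str = "moe_config.json") -> Dict[int, dict]:
        # Keys are stored as strings in JSON; convert to ints for nearest-key lookup.
        with open(path) as f:
            return {int(k): v for k, v in json.load(f).items()}


    def pick_config(configs: Dict[int, dict], num_tokens: int) -> dict:
        # Choose the tuned entry whose token-count key is closest to the actual batch.
        nearest = min(configs, key=lambda k: abs(k - num_tokens))
        return configs[nearest]


    configs = load_moe_config()
    print(pick_config(configs, 100))  # picks the "96" entry for a 100-token batch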
@@ -316,6 +316,7 @@ class FusedMoE(torch.nn.Module):
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()

+        self.hidden_size = hidden_size
         self.tp_size = (
             tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
         )
@@ -32,8 +32,8 @@ def hash_kernel(
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements

-    data = tl.load(input_ptr + offsets, mask=mask, other=0)
-    mixed = data ^ (offsets + XCONST)
+    data = tl.load(input_ptr + offsets, mask=mask, other=0).to(tl.int64)
+    mixed = data ^ (offsets.to(tl.int64) + XCONST)
    hash_val = mixed * PRIME
    hash_val = hash_val ^ (hash_val >> 16)
    hash_val = hash_val * (PRIME ^ XCONST)
@@ -53,7 +53,7 @@ def gpu_tensor_hash(tensor: torch.Tensor) -> int:
    BLOCK_SIZE = 1024
    grid = (triton.cdiv(n, BLOCK_SIZE),)

-    intermediate_hashes = torch.empty(n, dtype=torch.int32, device=tensor.device)
+    intermediate_hashes = torch.empty(n, dtype=torch.int64, device=tensor.device)

    hash_kernel[grid](
        tensor,
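The kernel now widens the loaded data and offsets to int64 before mixing, and the intermediate buffer matches that width, so the multiplicative mixing no longer wraps at 32 bits. A toy illustration of the difference in pure Python, with made-up constants (not the kernel's actual PRIME/XCONST values):

    PRIME = 0x9E3779B1    # example odd constant, for illustration only
    XCONST = 0x85EBCA6B   # example constant, for illustration only


    def mix(value: int, offset: int, bits: int) -> int:
        # Reproduce the kernel's mixing steps at a fixed integer width.
        mask = (1 << bits) - 1
        mixed = (value ^ (offset + XCONST)) & mask
        h = (mixed * PRIME) & mask        # at 32 bits the high product bits are discarded
        h ^= h >> 16
        h = (h * (PRIME ^ XCONST)) & mask
        return h


    value, offset = 123_456_789, 42
    print(hex(mix(value, offset, 32)))    # wraps: only the low 32 bits of each product survive
    print(hex(mix(value, offset, 64)))    # keeps the full 64-bit mixing state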
@@ -114,7 +114,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
    if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
        raise ValueError(
            f"{quantization} quantization requires some operators from vllm. "
-            "Please install vllm by `pip install vllm==0.8.4`"
+            "Please install vllm by `pip install vllm==0.9.0.1`"
        )

    return QUANTIZATION_METHODS[quantization]
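The error message now points at the vllm release this version is tested against. The surrounding guard follows the usual optional-dependency pattern; a minimal sketch of that pattern (illustrative only, the project's actual VLLM_AVAILABLE flag and method registry are defined elsewhere):

    import importlib.util

    # Probe for the optional dependency once at import time.
    VLLM_AVAILABLE = importlib.util.find_spec("vllm") is not None


    def require_vllm(feature: str) -> None:
        # Fail with an actionable message when a vllm-backed feature is requested.
        if not VLLM_AVAILABLE:
            raise ValueError(
                f"{feature} quantization requires some operators from vllm. "
                "Please install vllm by `pip install vllm==0.9.0.1`"
            )


    try:
        require_vllm("gptq_marlin")  # hypothetical method name, for illustration
    except ValueError as err:
        print(err)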
@@ -316,7 +316,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
        if correction_bias is not None:
            if not has_correction_bias:
                raise ValueError(
-                    "Please increase the version of your vllm. Try `pip install vllm==0.8.4`"
+                    "Please increase the version of your vllm. Try `pip install vllm==0.9.0.1`"
                )
            kwargs["e_score_correction_bias"] = correction_bias
        return original_apply(**kwargs)
@@ -81,7 +81,6 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-In
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST = "qwen2-vl"

DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"