chore: upgrade flashinfer v0.2.6.post1 jit (#6958)
Co-authored-by: alcanderian <alcanderian@gmail.com>
Co-authored-by: Qiaolin Yu <qy254@cornell.edu>
Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
Co-authored-by: Mick <mickjagger19@icloud.com>
Co-authored-by: ispobock <ispobaoke@gmail.com>
@@ -49,10 +49,11 @@ runtime_common = [
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.1.6.post1",
-    "flashinfer_python==0.2.5",
-    "torch==2.6.0",
-    "torchvision==0.21.0",
+    "sgl-kernel==0.1.7",
+    "flashinfer_python==0.2.6.post1",
+    "torch==2.7.1",
+    "torchaudio==2.7.1",
+    "torchvision==0.22.1",
     "cuda-python",
     "outlines>=0.0.44,<=0.1.11",
     "einops",
@@ -61,12 +62,13 @@ srt = [
 blackwell = [
     "sglang[runtime_common]",
     "sgl-kernel",
-    "torch==2.7.0",
+    "torch==2.7.1",
+    "torchaudio==2.7.1",
     "torchvision==0.22.0",
     "cuda-python",
     "outlines>=0.0.44,<=0.1.11",
     "einops",
-    "flashinfer_python==0.2.5",
+    "flashinfer_python==0.2.6.post1",
 ]

 # HIP (Heterogeneous-computing Interface for Portability) for AMD
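For reference, the updated pins can be read straight from the project metadata. A minimal sketch, assuming the file sits at pyproject.toml in the repository root and Python 3.11+ is available (tomllib is stdlib there; older interpreters would need the third-party tomli package):

    import tomllib  # stdlib in Python 3.11+

    with open("pyproject.toml", "rb") as f:
        project = tomllib.load(f)

    # PEP 621 extras live under [project.optional-dependencies].
    extras = project["project"]["optional-dependencies"]
    for name in ("srt", "blackwell"):
        print(name)
        for requirement in extras[name]:
            print("   ", requirement)
    # Expected to list the new pins, e.g. sgl-kernel==0.1.7 and flashinfer_python==0.2.6.post1.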
@@ -571,7 +571,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.5",
+            "0.2.6.post1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -579,7 +579,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.1.6.post1",
+            "0.1.7",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )

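Both checks fail fast at server start-up when an older wheel is still installed. As a rough illustration of what such a guard can look like (a minimal sketch using importlib.metadata and the third-party packaging library; sglang's actual assert_pkg_version helper may differ in details):

    from importlib.metadata import PackageNotFoundError, version

    from packaging.version import Version  # third-party "packaging" package


    def check_pkg_version(pkg: str, required: str, hint: str) -> None:
        """Raise if `pkg` is missing or older than `required` (illustrative only)."""
        try:
            installed = version(pkg)
        except PackageNotFoundError:
            raise RuntimeError(f"{pkg} is not installed. {hint}")
        if Version(installed) < Version(required):
            raise RuntimeError(f"{pkg}=={installed} is too old (need >= {required}). {hint}")


    check_pkg_version("flashinfer_python", "0.2.6.post1", "See https://docs.flashinfer.ai/installation.html.")
    check_pkg_version("sgl-kernel", "0.1.7", "Run `pip install sgl-kernel --force-reinstall`.")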
@@ -0,0 +1,146 @@
{
    "1": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 64,
        "BLOCK_SIZE_K": 256,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 4
    },
    "2": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 64,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 16,
        "num_warps": 4,
        "num_stages": 5
    },
    "4": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 32,
        "BLOCK_SIZE_K": 256,
        "GROUP_SIZE_M": 16,
        "num_warps": 4,
        "num_stages": 3
    },
    "8": {
        "BLOCK_SIZE_M": 32,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 3
    },
    "16": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 64,
        "num_warps": 8,
        "num_stages": 3
    },
    "24": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 32,
        "num_warps": 8,
        "num_stages": 3
    },
    "32": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 3
    },
    "48": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 4
    },
    "64": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 4
    },
    "96": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 5
    },
    "128": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 5
    },
    "256": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 1,
        "num_warps": 8,
        "num_stages": 5
    },
    "512": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 64,
        "num_warps": 8,
        "num_stages": 4
    },
    "1024": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 64,
        "num_warps": 8,
        "num_stages": 4
    },
    "1536": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 32,
        "num_warps": 8,
        "num_stages": 4
    },
    "2048": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 64,
        "num_warps": 8,
        "num_stages": 3
    },
    "3072": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 32,
        "num_warps": 8,
        "num_stages": 4
    },
    "4096": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 16,
        "num_warps": 8,
        "num_stages": 4
    }
}
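This new file adds Triton tuning parameters for the fused-MoE kernel, keyed by the number of tokens in the batch. Loaders for such tables typically pick the entry whose key is closest to the actual token count. A minimal sketch of that lookup, assuming the JSON above is saved as moe_config.json (the path and the project's real loader differ):

    import json
    from typing import Dict


    def load_moe_config(path: str = "moe_config.json") -> Dict[int, dict]:
        # Keys are stored as strings in JSON; convert to ints for nearest-key lookup.
        with open(path) as f:
            return {int(k): v for k, v in json.load(f).items()}


    def pick_config(configs: Dict[int, dict], num_tokens: int) -> dict:
        # Choose the tuned entry whose token-count key is closest to the actual batch.
        nearest = min(configs, key=lambda k: abs(k - num_tokens))
        return configs[nearest]


    configs = load_moe_config()
    print(pick_config(configs, 100))  # picks the "96" entry for a 100-token batch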
@@ -316,6 +316,7 @@ class FusedMoE(torch.nn.Module):
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()

+        self.hidden_size = hidden_size
         self.tp_size = (
             tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
         )
@@ -32,8 +32,8 @@ def hash_kernel(
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements

-    data = tl.load(input_ptr + offsets, mask=mask, other=0)
-    mixed = data ^ (offsets + XCONST)
+    data = tl.load(input_ptr + offsets, mask=mask, other=0).to(tl.int64)
+    mixed = data ^ (offsets.to(tl.int64) + XCONST)
    hash_val = mixed * PRIME
    hash_val = hash_val ^ (hash_val >> 16)
    hash_val = hash_val * (PRIME ^ XCONST)
@@ -53,7 +53,7 @@ def gpu_tensor_hash(tensor: torch.Tensor) -> int:
    BLOCK_SIZE = 1024
    grid = (triton.cdiv(n, BLOCK_SIZE),)

-    intermediate_hashes = torch.empty(n, dtype=torch.int32, device=tensor.device)
+    intermediate_hashes = torch.empty(n, dtype=torch.int64, device=tensor.device)

    hash_kernel[grid](
        tensor,
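The kernel now widens the loaded data and offsets to int64 before mixing, and the intermediate buffer matches that width, so the multiplicative mixing no longer wraps at 32 bits. A toy illustration of the difference in pure Python, with made-up constants (not the kernel's actual PRIME/XCONST values):

    PRIME = 0x9E3779B1    # example odd constant, for illustration only
    XCONST = 0x85EBCA6B   # example constant, for illustration only


    def mix(value: int, offset: int, bits: int) -> int:
        # Reproduce the kernel's mixing steps at a fixed integer width.
        mask = (1 << bits) - 1
        mixed = (value ^ (offset + XCONST)) & mask
        h = (mixed * PRIME) & mask        # at 32 bits the high product bits are discarded
        h ^= h >> 16
        h = (h * (PRIME ^ XCONST)) & mask
        return h


    value, offset = 123_456_789, 42
    print(hex(mix(value, offset, 32)))    # wraps: only the low 32 bits of each product survive
    print(hex(mix(value, offset, 64)))    # keeps the full 64-bit mixing state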
@@ -114,7 +114,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
    if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
        raise ValueError(
            f"{quantization} quantization requires some operators from vllm. "
-            "Please install vllm by `pip install vllm==0.8.4`"
+            "Please install vllm by `pip install vllm==0.9.0.1`"
        )

    return QUANTIZATION_METHODS[quantization]
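The error message now points at the vllm release this version is tested against. The surrounding guard follows the usual optional-dependency pattern; a minimal sketch of that pattern (illustrative only, the project's actual VLLM_AVAILABLE flag and method registry are defined elsewhere):

    import importlib.util

    # Probe for the optional dependency once at import time.
    VLLM_AVAILABLE = importlib.util.find_spec("vllm") is not None


    def require_vllm(feature: str) -> None:
        # Fail with an actionable message when a vllm-backed feature is requested.
        if not VLLM_AVAILABLE:
            raise ValueError(
                f"{feature} quantization requires some operators from vllm. "
                "Please install vllm by `pip install vllm==0.9.0.1`"
            )


    try:
        require_vllm("gptq_marlin")  # hypothetical method name, for illustration
    except ValueError as err:
        print(err)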
@@ -316,7 +316,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
        if correction_bias is not None:
            if not has_correction_bias:
                raise ValueError(
-                    "Please increase the version of your vllm. Try `pip install vllm==0.8.4`"
+                    "Please increase the version of your vllm. Try `pip install vllm==0.9.0.1`"
                )
            kwargs["e_score_correction_bias"] = correction_bias
        return original_apply(**kwargs)
@@ -81,7 +81,6 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-In
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST = "qwen2-vl"

DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"