feat: update model_specific_adjustment (#5344)

Co-authored-by: hebiao064 <hebiaobuaa@gmail.com>
This commit is contained in:
Yineng Zhang
2025-04-15 14:45:15 -07:00
committed by GitHub
parent e8f62b20ca
commit fa909dc3c4
4 changed files with 51 additions and 17 deletions

View File

@@ -78,7 +78,7 @@ class ForwardMode(IntEnum):
self == ForwardMode.EXTEND
or self == ForwardMode.MIXED
or self == ForwardMode.DRAFT_EXTEND
or self == self.TARGET_VERIFY
or self == ForwardMode.TARGET_VERIFY
)
def is_decode(self):
@@ -96,6 +96,13 @@ class ForwardMode(IntEnum):
def is_draft_extend(self):
    """Return True when this mode equals ``ForwardMode.DRAFT_EXTEND``."""
    draft_extend_mode = ForwardMode.DRAFT_EXTEND
    return self == draft_extend_mode
def is_extend_or_draft_extend_or_mixed(self):
    """Return True when this mode is EXTEND, DRAFT_EXTEND, or MIXED."""
    # Membership over the three accepted modes replaces the chained `or`
    # comparisons; the result is still a plain bool.
    accepted_modes = (
        ForwardMode.EXTEND,
        ForwardMode.DRAFT_EXTEND,
        ForwardMode.MIXED,
    )
    return self in accepted_modes
def is_cuda_graph(self):
return (
self == ForwardMode.DECODE
@@ -103,9 +110,6 @@ class ForwardMode(IntEnum):
or self == ForwardMode.IDLE
)
def is_extend_or_draft_extend(self):
    """Return True when this mode is EXTEND or DRAFT_EXTEND."""
    accepted_modes = (ForwardMode.EXTEND, ForwardMode.DRAFT_EXTEND)
    return self in accepted_modes
def is_dummy_first(self):
    """Return True when this mode equals ``ForwardMode.DUMMY_FIRST``."""
    dummy_first_mode = ForwardMode.DUMMY_FIRST
    return self == dummy_first_mode

View File

@@ -78,9 +78,11 @@ from sglang.srt.utils import (
get_available_gpu_memory,
init_custom_process_group,
is_cuda,
is_fa3_default_architecture,
is_flashinfer_available,
is_hip,
is_hopper_with_cuda_12_3,
is_no_spec_infer_or_topk_one,
monkey_patch_p2p_access_check,
monkey_patch_vllm_gguf_config,
set_cpu_offload_max_bytes,
@@ -242,18 +244,21 @@ class ModelRunner:
elif server_args.attention_backend is None:
# By default, use flashinfer for non-mla attention and triton for mla attention
if not self.use_mla_backend:
server_args.attention_backend = (
"flashinfer" if is_flashinfer_available() else "triton"
)
if (
is_hopper_with_cuda_12_3()
and is_no_spec_infer_or_topk_one(server_args)
and is_fa3_default_architecture(self.model_config.hf_config)
):
server_args.attention_backend = "fa3"
else:
server_args.attention_backend = (
"flashinfer" if is_flashinfer_available() else "triton"
)
else:
if is_hopper_with_cuda_12_3():
if server_args.speculative_eagle_topk is None or (
server_args.speculative_eagle_topk is not None
and server_args.speculative_eagle_topk == 1
):
server_args.attention_backend = "fa3"
else:
server_args.attention_backend = "triton"
if is_hopper_with_cuda_12_3() and is_no_spec_infer_or_topk_one(
server_args
):
server_args.attention_backend = "fa3"
else:
server_args.attention_backend = "triton"
logger.info(