feat: update model_specific_adjustment (#5344)
Co-authored-by: hebiao064 <hebiaobuaa@gmail.com>
This commit is contained in:
@@ -78,7 +78,7 @@ class ForwardMode(IntEnum):
|
||||
self == ForwardMode.EXTEND
|
||||
or self == ForwardMode.MIXED
|
||||
or self == ForwardMode.DRAFT_EXTEND
|
||||
or self == self.TARGET_VERIFY
|
||||
or self == ForwardMode.TARGET_VERIFY
|
||||
)
|
||||
|
||||
def is_decode(self):
|
||||
@@ -96,6 +96,13 @@ class ForwardMode(IntEnum):
|
||||
def is_draft_extend(self):
    """Return True when this mode equals ``ForwardMode.DRAFT_EXTEND``."""
    draft_extend = ForwardMode.DRAFT_EXTEND
    return self == draft_extend
|
||||
|
||||
def is_extend_or_draft_extend_or_mixed(self):
    """Return True when this mode is EXTEND, DRAFT_EXTEND, or MIXED."""
    # Membership test over the three accepted modes; equivalent to the
    # chained equality comparisons.
    return self in (
        ForwardMode.EXTEND,
        ForwardMode.DRAFT_EXTEND,
        ForwardMode.MIXED,
    )
|
||||
|
||||
def is_cuda_graph(self):
|
||||
return (
|
||||
self == ForwardMode.DECODE
|
||||
@@ -103,9 +110,6 @@ class ForwardMode(IntEnum):
|
||||
or self == ForwardMode.IDLE
|
||||
)
|
||||
|
||||
def is_extend_or_draft_extend(self):
    """Return True when this mode is EXTEND or DRAFT_EXTEND."""
    return self in (ForwardMode.EXTEND, ForwardMode.DRAFT_EXTEND)
|
||||
|
||||
def is_dummy_first(self):
    """Return True when this mode equals ``ForwardMode.DUMMY_FIRST``."""
    dummy_first = ForwardMode.DUMMY_FIRST
    return self == dummy_first
|
||||
|
||||
|
||||
@@ -78,9 +78,11 @@ from sglang.srt.utils import (
|
||||
get_available_gpu_memory,
|
||||
init_custom_process_group,
|
||||
is_cuda,
|
||||
is_fa3_default_architecture,
|
||||
is_flashinfer_available,
|
||||
is_hip,
|
||||
is_hopper_with_cuda_12_3,
|
||||
is_no_spec_infer_or_topk_one,
|
||||
monkey_patch_p2p_access_check,
|
||||
monkey_patch_vllm_gguf_config,
|
||||
set_cpu_offload_max_bytes,
|
||||
@@ -242,18 +244,21 @@ class ModelRunner:
|
||||
elif server_args.attention_backend is None:
|
||||
# By default, use flashinfer for non-mla attention and triton for mla attention
|
||||
if not self.use_mla_backend:
|
||||
server_args.attention_backend = (
|
||||
"flashinfer" if is_flashinfer_available() else "triton"
|
||||
)
|
||||
if (
|
||||
is_hopper_with_cuda_12_3()
|
||||
and is_no_spec_infer_or_topk_one(server_args)
|
||||
and is_fa3_default_architecture(self.model_config.hf_config)
|
||||
):
|
||||
server_args.attention_backend = "fa3"
|
||||
else:
|
||||
server_args.attention_backend = (
|
||||
"flashinfer" if is_flashinfer_available() else "triton"
|
||||
)
|
||||
else:
|
||||
if is_hopper_with_cuda_12_3():
|
||||
if server_args.speculative_eagle_topk is None or (
|
||||
server_args.speculative_eagle_topk is not None
|
||||
and server_args.speculative_eagle_topk == 1
|
||||
):
|
||||
server_args.attention_backend = "fa3"
|
||||
else:
|
||||
server_args.attention_backend = "triton"
|
||||
if is_hopper_with_cuda_12_3() and is_no_spec_infer_or_topk_one(
|
||||
server_args
|
||||
):
|
||||
server_args.attention_backend = "fa3"
|
||||
else:
|
||||
server_args.attention_backend = "triton"
|
||||
logger.info(
|
||||
|
||||
Reference in New Issue
Block a user