fix: Gemma 3 should not use attention logit softcapping (#5622)
This commit is contained in:
@@ -78,6 +78,11 @@ class ModelConfig:
|
|||||||
logger.info(
|
logger.info(
|
||||||
"Multimodal is disabled for Llama4. To enable it, set --enable-llama4-multimodal."
|
"Multimodal is disabled for Llama4. To enable it, set --enable-llama4-multimodal."
|
||||||
)
|
)
|
||||||
|
elif self.hf_config.architectures[0] == "Gemma3ForConditionalGeneration":
|
||||||
|
enable_multimodal = False
|
||||||
|
logger.info(
|
||||||
|
"Multimodal is disabled for Gemma3. To enable it, set --enable-gemma3-multimodal."
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
enable_multimodal = True
|
enable_multimodal = True
|
||||||
|
|
||||||
|
|||||||
@@ -189,7 +189,7 @@ class Gemma3Attention(nn.Module):
|
|||||||
self.scaling,
|
self.scaling,
|
||||||
num_kv_heads=self.num_kv_heads,
|
num_kv_heads=self.num_kv_heads,
|
||||||
layer_id=layer_id,
|
layer_id=layer_id,
|
||||||
logit_cap=getattr(self.config, "attn_logit_softcapping", None),
|
logit_cap=0.0,
|
||||||
# Module must also define `get_attention_sliding_window_size` to correctly initialize
|
# Module must also define `get_attention_sliding_window_size` to correctly initialize
|
||||||
# attention backend in `ForwardBatch`.
|
# attention backend in `ForwardBatch`.
|
||||||
sliding_window_size=self.sliding_window,
|
sliding_window_size=self.sliding_window,
|
||||||
|
|||||||
@@ -154,6 +154,7 @@ class ServerArgs:
|
|||||||
disable_outlines_disk_cache: bool = False
|
disable_outlines_disk_cache: bool = False
|
||||||
disable_custom_all_reduce: bool = False
|
disable_custom_all_reduce: bool = False
|
||||||
enable_llama4_multimodal: Optional[bool] = None
|
enable_llama4_multimodal: Optional[bool] = None
|
||||||
|
enable_gemma3_multimodal: Optional[bool] = None
|
||||||
disable_overlap_schedule: bool = False
|
disable_overlap_schedule: bool = False
|
||||||
enable_mixed_chunk: bool = False
|
enable_mixed_chunk: bool = False
|
||||||
enable_dp_attention: bool = False
|
enable_dp_attention: bool = False
|
||||||
@@ -285,7 +286,9 @@ class ServerArgs:
|
|||||||
if self.grammar_backend is None:
|
if self.grammar_backend is None:
|
||||||
self.grammar_backend = "xgrammar"
|
self.grammar_backend = "xgrammar"
|
||||||
|
|
||||||
self.enable_multimodal: Optional[bool] = self.enable_llama4_multimodal
|
self.enable_multimodal: Optional[bool] = (
|
||||||
|
self.enable_llama4_multimodal or self.enable_gemma3_multimodal
|
||||||
|
)
|
||||||
|
|
||||||
# Data parallelism attention
|
# Data parallelism attention
|
||||||
if self.enable_dp_attention:
|
if self.enable_dp_attention:
|
||||||
@@ -984,6 +987,12 @@ class ServerArgs:
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="Enable the multimodal functionality for Llama-4.",
|
help="Enable the multimodal functionality for Llama-4.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--enable-gemma3-multimodal",
|
||||||
|
default=ServerArgs.enable_gemma3_multimodal,
|
||||||
|
action="store_true",
|
||||||
|
help="Enable the multimodal functionality for Gemma-3.",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--disable-overlap-schedule",
|
"--disable-overlap-schedule",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
|
|||||||
@@ -1971,6 +1971,7 @@ def is_fa3_default_architecture(hf_config):
|
|||||||
"LlamaForCausalLM",
|
"LlamaForCausalLM",
|
||||||
"MistralForCausalLM",
|
"MistralForCausalLM",
|
||||||
"Gemma2ForCausalLM",
|
"Gemma2ForCausalLM",
|
||||||
|
"Gemma3ForConditionalGeneration",
|
||||||
}
|
}
|
||||||
return architectures[0] in default_archs
|
return architectures[0] in default_archs
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user