diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 037505dd7..3b52f5801 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -1928,6 +1928,12 @@ class ServerArgs: if "Llama4" in model_arch: assert self.attention_backend == "fa3", "fa3 is required for Llama4 model" + if "Gemma2ForCausalLM" in model_arch: + # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model. + # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736 + logger.warning("Disable hybrid SWA memory for Gemma2ForCausalLM.") + self.disable_hybrid_swa_memory = True + # Check LoRA self.check_lora_server_args() diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index c155a4d6d..9002bd0e5 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -27,9 +27,6 @@ import torch.nn.functional as F from sglang.bench_serving import run_benchmark from sglang.global_config import global_config -from sglang.lang.backend.openai import OpenAI -from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint -from sglang.lang.interpreter import ProgramState from sglang.srt.utils import ( get_bool_env_var, get_device, @@ -358,6 +355,9 @@ def add_common_sglang_args_and_parse(parser: argparse.ArgumentParser): def select_sglang_backend(args: argparse.Namespace): + from sglang.lang.backend.openai import OpenAI + from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint + if args.backend.startswith("srt"): if args.backend == "srt-no-parallel": global_config.enable_parallel_encoding = False