From da681f35d3ebf7f2d957448b010446b2b0713b36 Mon Sep 17 00:00:00 2001
From: Yineng Zhang
Date: Fri, 17 Oct 2025 10:01:36 -0700
Subject: [PATCH] Revert "Set csgmv as default lora backend. (#11488)" (#11735)

---
 benchmark/lora/launch_server.py        | 2 +-
 python/sglang/srt/server_args.py       | 2 +-
 python/sglang/test/runners.py          | 2 +-
 test/srt/lora/test_lora.py             | 6 ++++--
 test/srt/lora/test_lora_cuda_graph.py  | 2 ++
 test/srt/lora/test_lora_eviction.py    | 2 ++
 test/srt/lora/test_lora_qwen3.py       | 6 ++++--
 test/srt/lora/test_lora_radix_cache.py | 3 +++
 test/srt/lora/test_lora_tp.py          | 1 +
 test/srt/lora/test_lora_update.py      | 2 +-
 test/srt/lora/utils.py                 | 6 +++---
 11 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/benchmark/lora/launch_server.py b/benchmark/lora/launch_server.py
index 5dcf66ad6..de93a6e13 100644
--- a/benchmark/lora/launch_server.py
+++ b/benchmark/lora/launch_server.py
@@ -53,7 +53,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--lora-backend",
         type=str,
-        default="csgmv",
+        default="triton",
     )
     parser.add_argument(
         "--tp-size",
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 864432496..dfb341128 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -309,8 +309,8 @@ class ServerArgs:
     ] = None
     max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
-    lora_backend: str = "csgmv"
     lora_eviction_policy: str = DEFAULT_LORA_EVICTION_POLICY
+    lora_backend: str = "triton"
     max_lora_chunk_size: Optional[int] = 16

     # Kernel backend
diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py
index 9eecb14b5..dc7efe528 100644
--- a/python/sglang/test/runners.py
+++ b/python/sglang/test/runners.py
@@ -496,7 +496,7 @@ class SRTRunner:
         attention_backend: Optional[str] = None,
         prefill_attention_backend: Optional[str] = None,
         decode_attention_backend: Optional[str] = None,
-        lora_backend: str = "csgmv",
+        lora_backend: str = "triton",
         disable_cuda_graph: bool = False,
         disable_radix_cache: bool = False,
         chunked_prefill_size: Optional[int] = None,
diff --git a/test/srt/lora/test_lora.py b/test/srt/lora/test_lora.py
index 3ab7b624d..ab1c630fc 100644
--- a/test/srt/lora/test_lora.py
+++ b/test/srt/lora/test_lora.py
@@ -81,12 +81,13 @@ class TestLoRA(CustomTestCase):
         for model_case in model_cases:
             for torch_dtype in TORCH_DTYPES:
                 max_new_tokens = 32
+                backend = "triton"
                 base_path = model_case.base
                 lora_adapter_paths = [a.name for a in model_case.adaptors]
                 assert len(lora_adapter_paths) >= 2

                 print(
-                    f"\n========== Testing multiple batches on base '{base_path}', dtype={torch_dtype} ---"
+                    f"\n========== Testing multiple batches on base '{base_path}' with backend={backend}, dtype={torch_dtype} ---"
                 )

                 # Initialize runners
@@ -96,6 +97,7 @@ class TestLoRA(CustomTestCase):
                     model_type="generation",
                     lora_paths=[lora_adapter_paths[0], lora_adapter_paths[1]],
                     max_loras_per_batch=len(lora_adapter_paths) + 1,
+                    lora_backend=backend,
                     sleep_on_idle=True,  # Eliminate non-determinism by forcing all requests to be processed in one batch.
                     attention_backend="torch_native",
                 )
@@ -140,7 +142,7 @@ class TestLoRA(CustomTestCase):
                     if rouge_score < rouge_tol:
                         raise AssertionError(
                             f"ROUGE-L score {rouge_score} below tolerance {rouge_tol} "
-                            f"for base '{base_path}', adaptor '{lora_paths}', prompt: '{prompts}...'"
+                            f"for base '{base_path}', adaptor '{lora_paths}', backend '{backend}', prompt: '{prompts}...'"
                         )

                 print(f"--- Batch {i} Comparison Passed --- ")
diff --git a/test/srt/lora/test_lora_cuda_graph.py b/test/srt/lora/test_lora_cuda_graph.py
index d14e3c76e..ba68df59a 100644
--- a/test/srt/lora/test_lora_cuda_graph.py
+++ b/test/srt/lora/test_lora_cuda_graph.py
@@ -62,6 +62,7 @@ class TestLoRACudaGraph(CustomTestCase):
                 model_case,
                 torch_dtype,
                 max_new_tokens=32,
+                backend="triton",
                 disable_cuda_graph=True,
                 test_tag="without_cuda_graph",
             )
@@ -76,6 +77,7 @@ class TestLoRACudaGraph(CustomTestCase):
                 model_case,
                 torch_dtype,
                 max_new_tokens=32,
+                backend="triton",
                 disable_cuda_graph=False,
                 test_tag="cuda_graph_padding",
             )
diff --git a/test/srt/lora/test_lora_eviction.py b/test/srt/lora/test_lora_eviction.py
index fc1e00e3d..d27b11906 100644
--- a/test/srt/lora/test_lora_eviction.py
+++ b/test/srt/lora/test_lora_eviction.py
@@ -83,6 +83,7 @@ class TestLoRAEviction(CustomTestCase):
     ):
         REUSED_LORA_NAME = "lora"
         max_new_tokens = 256
+        backend = "triton"
         torch_dtype = torch.float16
         base_path = BASE_MODEL
         assert len(lora_paths) >= 2
@@ -95,6 +96,7 @@ class TestLoRAEviction(CustomTestCase):
             model_type="generation",
             lora_paths=initial_lora_paths,
             max_loras_per_batch=1,
+            lora_backend=backend,
             enable_lora=True,
             max_lora_rank=256,
             lora_target_modules=["all"],
diff --git a/test/srt/lora/test_lora_qwen3.py b/test/srt/lora/test_lora_qwen3.py
index beab18cf4..f77156707 100644
--- a/test/srt/lora/test_lora_qwen3.py
+++ b/test/srt/lora/test_lora_qwen3.py
@@ -71,6 +71,7 @@ class TestLoRAQwen3(CustomTestCase):
         for model_case in model_cases:
             for torch_dtype in TORCH_DTYPES:
                 max_new_tokens = 32
+                backend = "triton"
                 base_path = model_case.base
                 lora_adapter_paths = [a.name for a in model_case.adaptors]
                 assert len(lora_adapter_paths) >= 2
@@ -127,7 +128,7 @@ class TestLoRAQwen3(CustomTestCase):
             ]

             print(
-                f"\n========== Testing multiple batches on base '{base_path}', dtype={torch_dtype} ---"
+                f"\n========== Testing multiple batches on base '{base_path}' with backend={backend}, dtype={torch_dtype} ---"
             )

             # Initialize runners
@@ -138,6 +139,7 @@ class TestLoRAQwen3(CustomTestCase):
                 model_type="generation",
                 lora_paths=[lora_adapter_paths[0], lora_adapter_paths[1]],
                 max_loras_per_batch=len(lora_adapter_paths) + 1,
+                lora_backend=backend,
                 sleep_on_idle=True,  # Eliminate non-determinism by forcing all requests to be processed in one batch.
                 attention_backend="torch_native",
             )
@@ -181,7 +183,7 @@ class TestLoRAQwen3(CustomTestCase):
                 if rouge_score < rouge_tol:
                     raise AssertionError(
                         f"ROUGE-L score {rouge_score} below tolerance {rouge_tol} "
-                        f"for base '{base_path}', adaptor '{lora_paths}', prompt: '{prompts}...'"
+                        f"for base '{base_path}', adaptor '{lora_paths}', backend '{backend}', prompt: '{prompts}...'"
                     )

             print(f"--- Batch {i+1} Comparison Passed --- ")
diff --git a/test/srt/lora/test_lora_radix_cache.py b/test/srt/lora/test_lora_radix_cache.py
index 2faacf930..d3ecb219c 100644
--- a/test/srt/lora/test_lora_radix_cache.py
+++ b/test/srt/lora/test_lora_radix_cache.py
@@ -44,6 +44,7 @@ class TestLoRARadixCache(CustomTestCase):

         torch_dtype = torch.float16
         max_new_tokens = 32
+        backend = "triton"
         batch_prompts = (
             PROMPTS
             if not model_case.skip_long_prompt
@@ -56,6 +57,7 @@ class TestLoRARadixCache(CustomTestCase):
             model_case,
             torch_dtype,
             max_new_tokens=max_new_tokens,
+            backend=backend,
             disable_radix_cache=False,
             test_tag="lora-with-radix-cache",
         )
@@ -66,6+68,7 @@ class TestLoRARadixCache(CustomTestCase):
             model_case,
             torch_dtype,
             max_new_tokens=max_new_tokens,
+            backend=backend,
             disable_radix_cache=True,
             test_tag="lora-without-radix-cache",
         )
diff --git a/test/srt/lora/test_lora_tp.py b/test/srt/lora/test_lora_tp.py
index e459532a5..51a552d78 100644
--- a/test/srt/lora/test_lora_tp.py
+++ b/test/srt/lora/test_lora_tp.py
@@ -48,6 +48,7 @@ class TestLoRATP(CustomTestCase):
                 model_case,
                 torch_dtype,
                 max_new_tokens=32,
+                backend="triton",
                 test_tag=f"tp={tp_size}",
             )

diff --git a/test/srt/lora/test_lora_update.py b/test/srt/lora/test_lora_update.py
index bc3232ca0..073100e17 100644
--- a/test/srt/lora/test_lora_update.py
+++ b/test/srt/lora/test_lora_update.py
@@ -763,7 +763,7 @@ class LoRAUpdateTestSessionBase:
         max_lora_rank: Optional[int],
         enable_lora: Optional[bool] = None,
         lora_target_modules: Optional[List[str]] = None,
-        lora_backend: str = "csgmv",
+        lora_backend: str = "triton",
         disable_cuda_graph: bool = False,
         cuda_graph_max_bs: int = 4,
     ):
diff --git a/test/srt/lora/utils.py b/test/srt/lora/utils.py
index 42050fc72..94ce8ab60 100644
--- a/test/srt/lora/utils.py
+++ b/test/srt/lora/utils.py
@@ -14,7 +14,7 @@

 import dataclasses
 import random
-from typing import List, Optional
+from typing import List

 import torch

@@ -50,7 +50,7 @@ class LoRAModelCase:

 TORCH_DTYPES = [torch.float16]
-BACKENDS = ["triton", "csgmv"]
+BACKENDS = ["triton"]

 DEFAULT_PROMPTS = [
     "AI is a field of computer science focused on",
     """
@@ -135,7 +135,7 @@ def run_lora_test_one_by_one(
     model_case: LoRAModelCase,
     torch_dtype: torch.dtype,
     max_new_tokens: int,
-    backend: str = "csgmv",
+    backend: str,
     disable_cuda_graph: bool = False,
     disable_radix_cache: bool = False,
     mem_fraction_static: float = 0.88,