From b0d20cdec79c9b4cc1a10ee9cc2ffa35451a9df1 Mon Sep 17 00:00:00 2001 From: Lifu Huang Date: Wed, 15 Oct 2025 21:53:24 -0700 Subject: [PATCH] Set csgmv as default lora backend. (#11488) --- benchmark/lora/launch_server.py | 2 +- python/sglang/srt/server_args.py | 2 +- python/sglang/test/runners.py | 2 +- test/srt/lora/test_lora.py | 6 ++---- test/srt/lora/test_lora_cuda_graph.py | 2 -- test/srt/lora/test_lora_eviction.py | 2 -- test/srt/lora/test_lora_qwen3.py | 6 ++---- test/srt/lora/test_lora_radix_cache.py | 3 --- test/srt/lora/test_lora_tp.py | 1 - test/srt/lora/test_lora_update.py | 2 +- test/srt/lora/utils.py | 6 +++--- 11 files changed, 11 insertions(+), 23 deletions(-) diff --git a/benchmark/lora/launch_server.py b/benchmark/lora/launch_server.py index de93a6e13..5dcf66ad6 100644 --- a/benchmark/lora/launch_server.py +++ b/benchmark/lora/launch_server.py @@ -53,7 +53,7 @@ if __name__ == "__main__": parser.add_argument( "--lora-backend", type=str, - default="triton", + default="csgmv", ) parser.add_argument( "--tp-size", diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index b5ae57817..043bec845 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -309,8 +309,8 @@ class ServerArgs: ] = None max_loaded_loras: Optional[int] = None max_loras_per_batch: int = 8 + lora_backend: str = "csgmv" lora_eviction_policy: str = DEFAULT_LORA_EVICTION_POLICY - lora_backend: str = "triton" max_lora_chunk_size: Optional[int] = 16 # Kernel backend diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index dc7efe528..9eecb14b5 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -496,7 +496,7 @@ class SRTRunner: attention_backend: Optional[str] = None, prefill_attention_backend: Optional[str] = None, decode_attention_backend: Optional[str] = None, - lora_backend: str = "triton", + lora_backend: str = "csgmv", disable_cuda_graph: bool = False, disable_radix_cache: bool = False, chunked_prefill_size: Optional[int] = None, diff --git a/test/srt/lora/test_lora.py b/test/srt/lora/test_lora.py index ab1c630fc..3ab7b624d 100644 --- a/test/srt/lora/test_lora.py +++ b/test/srt/lora/test_lora.py @@ -81,13 +81,12 @@ class TestLoRA(CustomTestCase): for model_case in model_cases: for torch_dtype in TORCH_DTYPES: max_new_tokens = 32 - backend = "triton" base_path = model_case.base lora_adapter_paths = [a.name for a in model_case.adaptors] assert len(lora_adapter_paths) >= 2 print( - f"\n========== Testing multiple batches on base '{base_path}' with backend={backend}, dtype={torch_dtype} ---" + f"\n========== Testing multiple batches on base '{base_path}', dtype={torch_dtype} ---" ) # Initialize runners @@ -97,7 +96,6 @@ class TestLoRA(CustomTestCase): model_type="generation", lora_paths=[lora_adapter_paths[0], lora_adapter_paths[1]], max_loras_per_batch=len(lora_adapter_paths) + 1, - lora_backend=backend, sleep_on_idle=True, # Eliminate non-determinism by forcing all requests to be processed in one batch. attention_backend="torch_native", ) @@ -142,7 +140,7 @@ class TestLoRA(CustomTestCase): if rouge_score < rouge_tol: raise AssertionError( f"ROUGE-L score {rouge_score} below tolerance {rouge_tol} " - f"for base '{base_path}', adaptor '{lora_paths}', backend '{backend}', prompt: '{prompts}...'" + f"for base '{base_path}', adaptor '{lora_paths}', prompt: '{prompts}...'" ) print(f"--- Batch {i} Comparison Passed --- ") diff --git a/test/srt/lora/test_lora_cuda_graph.py b/test/srt/lora/test_lora_cuda_graph.py index ba68df59a..d14e3c76e 100644 --- a/test/srt/lora/test_lora_cuda_graph.py +++ b/test/srt/lora/test_lora_cuda_graph.py @@ -62,7 +62,6 @@ class TestLoRACudaGraph(CustomTestCase): model_case, torch_dtype, max_new_tokens=32, - backend="triton", disable_cuda_graph=True, test_tag="without_cuda_graph", ) @@ -77,7 +76,6 @@ class TestLoRACudaGraph(CustomTestCase): model_case, torch_dtype, max_new_tokens=32, - backend="triton", disable_cuda_graph=False, test_tag="cuda_graph_padding", ) diff --git a/test/srt/lora/test_lora_eviction.py b/test/srt/lora/test_lora_eviction.py index d27b11906..fc1e00e3d 100644 --- a/test/srt/lora/test_lora_eviction.py +++ b/test/srt/lora/test_lora_eviction.py @@ -83,7 +83,6 @@ class TestLoRAEviction(CustomTestCase): ): REUSED_LORA_NAME = "lora" max_new_tokens = 256 - backend = "triton" torch_dtype = torch.float16 base_path = BASE_MODEL assert len(lora_paths) >= 2 @@ -96,7 +95,6 @@ class TestLoRAEviction(CustomTestCase): model_type="generation", lora_paths=initial_lora_paths, max_loras_per_batch=1, - lora_backend=backend, enable_lora=True, max_lora_rank=256, lora_target_modules=["all"], diff --git a/test/srt/lora/test_lora_qwen3.py b/test/srt/lora/test_lora_qwen3.py index f77156707..beab18cf4 100644 --- a/test/srt/lora/test_lora_qwen3.py +++ b/test/srt/lora/test_lora_qwen3.py @@ -71,7 +71,6 @@ class TestLoRAQwen3(CustomTestCase): for model_case in model_cases: for torch_dtype in TORCH_DTYPES: max_new_tokens = 32 - backend = "triton" base_path = model_case.base lora_adapter_paths = [a.name for a in model_case.adaptors] assert len(lora_adapter_paths) >= 2 @@ -128,7 +127,7 @@ class TestLoRAQwen3(CustomTestCase): ] print( - f"\n========== Testing multiple batches on base '{base_path}' with backend={backend}, dtype={torch_dtype} ---" + f"\n========== Testing multiple batches on base '{base_path}', dtype={torch_dtype} ---" ) # Initialize runners @@ -139,7 +138,6 @@ class TestLoRAQwen3(CustomTestCase): model_type="generation", lora_paths=[lora_adapter_paths[0], lora_adapter_paths[1]], max_loras_per_batch=len(lora_adapter_paths) + 1, - lora_backend=backend, sleep_on_idle=True, # Eliminate non-determinism by forcing all requests to be processed in one batch. attention_backend="torch_native", ) @@ -183,7 +181,7 @@ class TestLoRAQwen3(CustomTestCase): if rouge_score < rouge_tol: raise AssertionError( f"ROUGE-L score {rouge_score} below tolerance {rouge_tol} " - f"for base '{base_path}', adaptor '{lora_paths}', backend '{backend}', prompt: '{prompts}...'" + f"for base '{base_path}', adaptor '{lora_paths}', prompt: '{prompts}...'" ) print(f"--- Batch {i+1} Comparison Passed --- ") diff --git a/test/srt/lora/test_lora_radix_cache.py b/test/srt/lora/test_lora_radix_cache.py index d3ecb219c..2faacf930 100644 --- a/test/srt/lora/test_lora_radix_cache.py +++ b/test/srt/lora/test_lora_radix_cache.py @@ -44,7 +44,6 @@ class TestLoRARadixCache(CustomTestCase): torch_dtype = torch.float16 max_new_tokens = 32 - backend = "triton" batch_prompts = ( PROMPTS if not model_case.skip_long_prompt @@ -57,7 +56,6 @@ class TestLoRARadixCache(CustomTestCase): model_case, torch_dtype, max_new_tokens=max_new_tokens, - backend=backend, disable_radix_cache=False, test_tag="lora-with-radix-cache", ) @@ -68,7 +66,6 @@ class TestLoRARadixCache(CustomTestCase): model_case, torch_dtype, max_new_tokens=max_new_tokens, - backend=backend, disable_radix_cache=True, test_tag="lora-without-radix-cache", ) diff --git a/test/srt/lora/test_lora_tp.py b/test/srt/lora/test_lora_tp.py index 51a552d78..e459532a5 100644 --- a/test/srt/lora/test_lora_tp.py +++ b/test/srt/lora/test_lora_tp.py @@ -48,7 +48,6 @@ class TestLoRATP(CustomTestCase): model_case, torch_dtype, max_new_tokens=32, - backend="triton", test_tag=f"tp={tp_size}", ) diff --git a/test/srt/lora/test_lora_update.py b/test/srt/lora/test_lora_update.py index 073100e17..bc3232ca0 100644 --- a/test/srt/lora/test_lora_update.py +++ b/test/srt/lora/test_lora_update.py @@ -763,7 +763,7 @@ class LoRAUpdateTestSessionBase: max_lora_rank: Optional[int], enable_lora: Optional[bool] = None, lora_target_modules: Optional[List[str]] = None, - lora_backend: str = "triton", + lora_backend: str = "csgmv", disable_cuda_graph: bool = False, cuda_graph_max_bs: int = 4, ): diff --git a/test/srt/lora/utils.py b/test/srt/lora/utils.py index 94ce8ab60..42050fc72 100644 --- a/test/srt/lora/utils.py +++ b/test/srt/lora/utils.py @@ -14,7 +14,7 @@ import dataclasses import random -from typing import List +from typing import List, Optional import torch @@ -50,7 +50,7 @@ class LoRAModelCase: TORCH_DTYPES = [torch.float16] -BACKENDS = ["triton"] +BACKENDS = ["triton", "csgmv"] DEFAULT_PROMPTS = [ "AI is a field of computer science focused on", """ @@ -135,7 +135,7 @@ def run_lora_test_one_by_one( model_case: LoRAModelCase, torch_dtype: torch.dtype, max_new_tokens: int, - backend: str, + backend: str = "csgmv", disable_cuda_graph: bool = False, disable_radix_cache: bool = False, mem_fraction_static: float = 0.88,