From da681f35d3ebf7f2d957448b010446b2b0713b36 Mon Sep 17 00:00:00 2001
From: Yineng Zhang
Date: Fri, 17 Oct 2025 10:01:36 -0700
Subject: [PATCH] Revert "Set csgmv as default lora backend. (#11488)" (#11735)

---
 benchmark/lora/launch_server.py        | 2 +-
 python/sglang/srt/server_args.py       | 2 +-
 python/sglang/test/runners.py          | 2 +-
 test/srt/lora/test_lora.py             | 6 ++++--
 test/srt/lora/test_lora_cuda_graph.py  | 2 ++
 test/srt/lora/test_lora_eviction.py    | 2 ++
 test/srt/lora/test_lora_qwen3.py       | 6 ++++--
 test/srt/lora/test_lora_radix_cache.py | 3 +++
 test/srt/lora/test_lora_tp.py          | 1 +
 test/srt/lora/test_lora_update.py      | 2 +-
 test/srt/lora/utils.py                 | 6 +++---
 11 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/benchmark/lora/launch_server.py b/benchmark/lora/launch_server.py
index 5dcf66ad6..de93a6e13 100644
--- a/benchmark/lora/launch_server.py
+++ b/benchmark/lora/launch_server.py
@@ -53,7 +53,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--lora-backend",
         type=str,
-        default="csgmv",
+        default="triton",
     )
     parser.add_argument(
         "--tp-size",
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 864432496..dfb341128 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -309,8 +309,8 @@ class ServerArgs:
     ] = None
     max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
-    lora_backend: str = "csgmv"
     lora_eviction_policy: str = DEFAULT_LORA_EVICTION_POLICY
+    lora_backend: str = "triton"
     max_lora_chunk_size: Optional[int] = 16

     # Kernel backend
diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py
index 9eecb14b5..dc7efe528 100644
--- a/python/sglang/test/runners.py
+++ b/python/sglang/test/runners.py
@@ -496,7 +496,7 @@ class SRTRunner:
         attention_backend: Optional[str] = None,
         prefill_attention_backend: Optional[str] = None,
         decode_attention_backend: Optional[str] = None,
-        lora_backend: str = "csgmv",
+        lora_backend: str = "triton",
         disable_cuda_graph: bool = False,
         disable_radix_cache: bool = False,
         chunked_prefill_size: Optional[int] = None,
diff --git a/test/srt/lora/test_lora.py b/test/srt/lora/test_lora.py
index 3ab7b624d..ab1c630fc 100644
--- a/test/srt/lora/test_lora.py
+++ b/test/srt/lora/test_lora.py
@@ -81,12 +81,13 @@ class TestLoRA(CustomTestCase):
         for model_case in model_cases:
             for torch_dtype in TORCH_DTYPES:
                 max_new_tokens = 32
+                backend = "triton"
                 base_path = model_case.base
                 lora_adapter_paths = [a.name for a in model_case.adaptors]
                 assert len(lora_adapter_paths) >= 2

                 print(
-                    f"\n========== Testing multiple batches on base '{base_path}', dtype={torch_dtype} ---"
+                    f"\n========== Testing multiple batches on base '{base_path}' with backend={backend}, dtype={torch_dtype} ---"
                 )

                 # Initialize runners
@@ -96,6 +97,7 @@ class TestLoRA(CustomTestCase):
                     model_type="generation",
                     lora_paths=[lora_adapter_paths[0], lora_adapter_paths[1]],
                     max_loras_per_batch=len(lora_adapter_paths) + 1,
+                    lora_backend=backend,
                     sleep_on_idle=True,  # Eliminate non-determinism by forcing all requests to be processed in one batch.
                     attention_backend="torch_native",
                 )
@@ -140,7 +142,7 @@ class TestLoRA(CustomTestCase):
                     if rouge_score < rouge_tol:
                         raise AssertionError(
                             f"ROUGE-L score {rouge_score} below tolerance {rouge_tol} "
-                            f"for base '{base_path}', adaptor '{lora_paths}', prompt: '{prompts}...'"
+                            f"for base '{base_path}', adaptor '{lora_paths}', backend '{backend}', prompt: '{prompts}...'"
                         )

                 print(f"--- Batch {i} Comparison Passed --- ")
diff --git a/test/srt/lora/test_lora_cuda_graph.py b/test/srt/lora/test_lora_cuda_graph.py
index d14e3c76e..ba68df59a 100644
--- a/test/srt/lora/test_lora_cuda_graph.py
+++ b/test/srt/lora/test_lora_cuda_graph.py
@@ -62,6 +62,7 @@ class TestLoRACudaGraph(CustomTestCase):
                 model_case,
                 torch_dtype,
                 max_new_tokens=32,
+                backend="triton",
                 disable_cuda_graph=True,
                 test_tag="without_cuda_graph",
             )
@@ -76,6 +77,7 @@ class TestLoRACudaGraph(CustomTestCase):
                 model_case,
                 torch_dtype,
                 max_new_tokens=32,
+                backend="triton",
                 disable_cuda_graph=False,
                 test_tag="cuda_graph_padding",
             )
diff --git a/test/srt/lora/test_lora_eviction.py b/test/srt/lora/test_lora_eviction.py
index fc1e00e3d..d27b11906 100644
--- a/test/srt/lora/test_lora_eviction.py
+++ b/test/srt/lora/test_lora_eviction.py
@@ -83,6 +83,7 @@ class TestLoRAEviction(CustomTestCase):
     ):
         REUSED_LORA_NAME = "lora"
         max_new_tokens = 256
+        backend = "triton"
         torch_dtype = torch.float16
         base_path = BASE_MODEL
         assert len(lora_paths) >= 2
@@ -95,6 +96,7 @@ class TestLoRAEviction(CustomTestCase):
             model_type="generation",
             lora_paths=initial_lora_paths,
             max_loras_per_batch=1,
+            lora_backend=backend,
             enable_lora=True,
             max_lora_rank=256,
             lora_target_modules=["all"],
diff --git a/test/srt/lora/test_lora_qwen3.py b/test/srt/lora/test_lora_qwen3.py
index beab18cf4..f77156707 100644
--- a/test/srt/lora/test_lora_qwen3.py
+++ b/test/srt/lora/test_lora_qwen3.py
@@ -71,6 +71,7 @@ class TestLoRAQwen3(CustomTestCase):
         for model_case in model_cases:
             for torch_dtype in TORCH_DTYPES:
                 max_new_tokens = 32
+                backend = "triton"
                 base_path = model_case.base
                 lora_adapter_paths = [a.name for a in model_case.adaptors]
                 assert len(lora_adapter_paths) >= 2
@@ -127,7 +128,7 @@ class TestLoRAQwen3(CustomTestCase):
             ]

             print(
-                f"\n========== Testing multiple batches on base '{base_path}', dtype={torch_dtype} ---"
+                f"\n========== Testing multiple batches on base '{base_path}' with backend={backend}, dtype={torch_dtype} ---"
             )

             # Initialize runners
@@ -138,6 +139,7 @@ class TestLoRAQwen3(CustomTestCase):
                 model_type="generation",
                 lora_paths=[lora_adapter_paths[0], lora_adapter_paths[1]],
                 max_loras_per_batch=len(lora_adapter_paths) + 1,
+                lora_backend=backend,
                 sleep_on_idle=True,  # Eliminate non-determinism by forcing all requests to be processed in one batch.
                 attention_backend="torch_native",
             )
@@ -181,7 +183,7 @@ class TestLoRAQwen3(CustomTestCase):
                 if rouge_score < rouge_tol:
                     raise AssertionError(
                         f"ROUGE-L score {rouge_score} below tolerance {rouge_tol} "
-                        f"for base '{base_path}', adaptor '{lora_paths}', prompt: '{prompts}...'"
+                        f"for base '{base_path}', adaptor '{lora_paths}', backend '{backend}', prompt: '{prompts}...'"
                     )

             print(f"--- Batch {i+1} Comparison Passed --- ")
diff --git a/test/srt/lora/test_lora_radix_cache.py b/test/srt/lora/test_lora_radix_cache.py
index 2faacf930..d3ecb219c 100644
--- a/test/srt/lora/test_lora_radix_cache.py
+++ b/test/srt/lora/test_lora_radix_cache.py
@@ -44,6 +44,7 @@ class TestLoRARadixCache(CustomTestCase):

         torch_dtype = torch.float16
         max_new_tokens = 32
+        backend = "triton"
         batch_prompts = (
             PROMPTS
             if not model_case.skip_long_prompt
@@ -56,6 +57,7 @@ class TestLoRARadixCache(CustomTestCase):
             model_case,
             torch_dtype,
             max_new_tokens=max_new_tokens,
+            backend=backend,
             disable_radix_cache=False,
             test_tag="lora-with-radix-cache",
         )
@@ -66,6+68,7 @@ class TestLoRARadixCache(CustomTestCase):
             model_case,
             torch_dtype,
             max_new_tokens=max_new_tokens,
+            backend=backend,
             disable_radix_cache=True,
             test_tag="lora-without-radix-cache",
         )
diff --git a/test/srt/lora/test_lora_tp.py b/test/srt/lora/test_lora_tp.py
index e459532a5..51a552d78 100644
--- a/test/srt/lora/test_lora_tp.py
+++ b/test/srt/lora/test_lora_tp.py
@@ -48,6 +48,7 @@ class TestLoRATP(CustomTestCase):
                 model_case,
                 torch_dtype,
                 max_new_tokens=32,
+                backend="triton",
                 test_tag=f"tp={tp_size}",
             )

diff --git a/test/srt/lora/test_lora_update.py b/test/srt/lora/test_lora_update.py
index bc3232ca0..073100e17 100644
--- a/test/srt/lora/test_lora_update.py
+++ b/test/srt/lora/test_lora_update.py
@@ -763,7 +763,7 @@ class LoRAUpdateTestSessionBase:
         max_lora_rank: Optional[int],
         enable_lora: Optional[bool] = None,
         lora_target_modules: Optional[List[str]] = None,
-        lora_backend: str = "csgmv",
+        lora_backend: str = "triton",
         disable_cuda_graph: bool = False,
         cuda_graph_max_bs: int = 4,
     ):
diff --git a/test/srt/lora/utils.py b/test/srt/lora/utils.py
index 42050fc72..94ce8ab60 100644
--- a/test/srt/lora/utils.py
+++ b/test/srt/lora/utils.py
@@ -14,7 +14,7 @@

 import dataclasses
 import random
-from typing import List, Optional
+from typing import List

 import torch

@@ -50,7 +50,7 @@ class LoRAModelCase:

 TORCH_DTYPES = [torch.float16]
-BACKENDS = ["triton", "csgmv"]
+BACKENDS = ["triton"]

 DEFAULT_PROMPTS = [
     "AI is a field of computer science focused on",
     """
@@ -135,7 +135,7 @@ def run_lora_test_one_by_one(
     model_case: LoRAModelCase,
     torch_dtype: torch.dtype,
     max_new_tokens: int,
-    backend: str = "csgmv",
+    backend: str,
     disable_cuda_graph: bool = False,
     disable_radix_cache: bool = False,
     mem_fraction_static: float = 0.88,