Support radix cache for Lora feature (#7216)

Author: Baizhou Zhang
Date: 2025-08-11 10:14:11 -07:00
Committed by: GitHub
Parent: 6f81a710f7
Commit: 75e6a7cde1
12 changed files with 546 additions and 27 deletions
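The hunks below are the test-side half of the change: the LoRA tests stop forcing `disable_radix_cache=True`, a dedicated radix-cache test file is added, and the shared test helpers flip their `disable_radix_cache` default to `False`. The server-side half is not shown in these hunks, but the property the new tests exercise is that prefix reuse must be adapter-aware: the same token prefix run through two different LoRA adapters yields different KV states, so a cached prefix is only valid for the adapter that produced it. A minimal sketch of that keying idea, with hypothetical names (a flat dict stands in for the real radix tree):

```python
from typing import Dict, Optional, Tuple

class LoRAAwarePrefixCache:
    """Toy prefix cache keyed by (adapter id, token prefix).

    Hypothetical sketch: SGLang's actual radix cache is a trie over token
    IDs; the correctness-critical part shown here is the adapter id in the key.
    """

    def __init__(self) -> None:
        # (lora_id, tokens) -> opaque handle to cached KV entries.
        # lora_id is None for base-model (no-adapter) requests.
        self._cache: Dict[Tuple[Optional[str], Tuple[int, ...]], object] = {}

    def insert(self, lora_id: Optional[str], tokens: Tuple[int, ...], kv: object) -> None:
        self._cache[(lora_id, tokens)] = kv

    def match_prefix(
        self, lora_id: Optional[str], tokens: Tuple[int, ...]
    ) -> Tuple[int, Optional[object]]:
        # Longest cached prefix for *this* adapter; entries cached under a
        # different adapter are never returned, even for identical tokens.
        for end in range(len(tokens), 0, -1):
            kv = self._cache.get((lora_id, tokens[:end]))
            if kv is not None:
                return end, kv
        return 0, None

cache = LoRAAwarePrefixCache()
cache.insert("adapter_a", (1, 2, 3), kv="kv-a")
print(cache.match_prefix("adapter_a", (1, 2, 3, 4)))  # (3, 'kv-a')
print(cache.match_prefix("adapter_b", (1, 2, 3, 4)))  # (0, None): no cross-adapter reuse
```

With adapter-aware keys, leaving the radix cache enabled is safe for LoRA requests, which is why every test below can simply drop the `disable_radix_cache=True` override.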

@@ -104,7 +104,6 @@ class TestLoRA(CustomTestCase):
             lora_paths=[lora_adapter_paths[0], lora_adapter_paths[1]],
             max_loras_per_batch=len(lora_adapter_paths) + 1,
             lora_backend=backend,
-            disable_radix_cache=True,
             sleep_on_idle=True,  # Eliminate non-determinism by forcing all requests to be processed in one batch.
             attention_backend="torch_native",
         )

@@ -97,7 +97,6 @@ class TestLoRAEviction(CustomTestCase):
             lora_paths=initial_lora_paths,
             max_loras_per_batch=1,
             lora_backend=backend,
-            disable_radix_cache=True,
             enable_lora=True,
             max_lora_rank=256,
             lora_target_modules=["all"],

@@ -140,7 +140,6 @@ class TestLoRA(CustomTestCase):
             lora_paths=[lora_adapter_paths[0], lora_adapter_paths[1]],
             max_loras_per_batch=len(lora_adapter_paths) + 1,
             lora_backend=backend,
-            disable_radix_cache=True,
         )
         hf_runner = HFRunner(
             base_path,

@@ -0,0 +1,83 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import multiprocessing as mp
+import random
+import unittest
+
+import torch
+from utils import CI_MULTI_LORA_MODELS, DEFAULT_PROMPTS, run_lora_test_one_by_one
+
+from sglang.test.runners import HFRunner, SRTRunner
+from sglang.test.test_utils import CustomTestCase
+
+PROMPTS = [
+    "AI is a field of computer science focused on",
+    """
+### Instruction:
+Tell me about llamas and alpacas
+### Response:
+Llamas are large, long-necked animals with a woolly coat. They have two toes on each foot instead of three like other camelids.
+### Question:
+What do you know about llamas?
+### Answer:
+""",
+]
+
+
+class TestLoRARadixCache(CustomTestCase):
+    def test_lora_radix_cache(self):
+        # We need a model case with multiple adapters to test radix cache correctness.
+        model_case = CI_MULTI_LORA_MODELS[0]
+        torch_dtype = torch.float16
+        max_new_tokens = 32
+        backend = "triton"
+        batch_prompts = (
+            PROMPTS
+            if not model_case.skip_long_prompt
+            else [p for p in PROMPTS if len(p) < 1000]
+        )
+
+        # Test LoRA with the radix cache enabled.
+        run_lora_test_one_by_one(
+            batch_prompts,
+            model_case,
+            torch_dtype,
+            max_new_tokens=max_new_tokens,
+            backend=backend,
+            disable_radix_cache=False,
+            test_tag="lora-with-radix-cache",
+        )
+
+        # Test LoRA with the radix cache disabled.
+        run_lora_test_one_by_one(
+            batch_prompts,
+            model_case,
+            torch_dtype,
+            max_new_tokens=max_new_tokens,
+            backend=backend,
+            disable_radix_cache=True,
+            test_tag="lora-without-radix-cache",
+        )
+
+
+if __name__ == "__main__":
+    try:
+        mp.set_start_method("spawn")
+    except RuntimeError:
+        pass
+
+    unittest.main(warnings="ignore")
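The same scenario can be reproduced outside the test harness. Below is a hedged sketch using SGLang's offline `Engine` entry point: the constructor arguments are the ones the tests above pass through `SRTRunner`, while the placeholder paths and the per-request `lora_path` argument are assumptions for illustration, not taken from this diff:

```python
import sglang as sgl

if __name__ == "__main__":
    # The radix cache is left at its default (now usable with LoRA); before
    # this commit, LoRA runs had to pass disable_radix_cache=True.
    llm = sgl.Engine(
        model_path="/path/to/base_model",  # placeholder
        lora_paths=["/path/to/adapter_a", "/path/to/adapter_b"],  # placeholders
        max_loras_per_batch=2,
        lora_backend="triton",
    )
    prompt = "AI is a field of computer science focused on"
    # Identical prompts under different adapters: the cache must not serve
    # adapter_a's prefix KV to the adapter_b request.
    out_a = llm.generate(prompt, lora_path="/path/to/adapter_a")  # assumed kwarg
    out_b = llm.generate(prompt, lora_path="/path/to/adapter_b")
    print(out_a["text"])
    print(out_b["text"])
    llm.shutdown()
```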

@@ -787,7 +787,6 @@ class LoRAUpdateEngineTestSession(LoRAUpdateTestSessionBase):
             max_loaded_loras=self.max_loaded_loras,
             disable_cuda_graph=self.disable_cuda_graph,
             cuda_graph_max_bs=self.cuda_graph_max_bs,
-            disable_radix_cache=True,
             enable_lora=self.enable_lora,
         )
         self.handle.__enter__()
@@ -917,7 +916,6 @@ class LoRAUpdateServerTestSession(LoRAUpdateTestSessionBase):
             str(self.max_loras_per_batch),
             "--lora-backend",
             self.lora_backend,
-            "--disable-radix-cache",
             "--random-seed",
             "42",
             "--max-running-request",

@@ -136,7 +136,7 @@ def run_lora_test_one_by_one(
     max_new_tokens: int,
     backend: str,
     disable_cuda_graph: bool = False,
-    disable_radix_cache: bool = True,
+    disable_radix_cache: bool = False,
     mem_fraction_static: float = 0.88,
     test_tag: str = "",
 ):
@@ -156,7 +156,7 @@ def run_lora_test_one_by_one(
         max_new_tokens (int): The maximum number of new tokens to generate.
         backend (str): The lora backend to use.
         disable_cuda_graph (bool, optional): Whether to disable CUDA graph. Defaults to False.
-        disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to True.
+        disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to False.
         mem_fraction_static (float, optional): The fraction of memory to use. Defaults to 0.88.
         test_tag (str, optional): The tag to use for the test. Defaults to "".
     """
@@ -284,7 +284,7 @@ def run_lora_test_by_batch(
     max_new_tokens: int,
     backend: str,
     disable_cuda_graph: bool = False,
-    disable_radix_cache: bool = True,
+    disable_radix_cache: bool = False,
     mem_fraction_static: float = 0.88,
     test_tag: str = "",
 ):
@@ -303,7 +303,7 @@ def run_lora_test_by_batch(
         max_new_tokens (int): The maximum number of new tokens to generate.
         backend (str): The lora backend to use.
         disable_cuda_graph (bool, optional): Whether to disable CUDA graph. Defaults to False.
-        disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to True.
+        disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to False.
         mem_fraction_static (float, optional): The fraction of memory to use. Defaults to 0.88.
         test_tag (str, optional): The tag to use for the test. Defaults to "".
     """

@@ -23,6 +23,7 @@ suites = {
         TestFile("lora/test_lora_cuda_graph.py", 250),
         TestFile("lora/test_lora_update.py", 400),
         TestFile("lora/test_lora_qwen3.py", 97),
+        TestFile("lora/test_lora_radix_cache.py", 100),
         TestFile("models/test_embedding_models.py", 73),
         # TestFile("models/test_clip_models.py", 52),
         TestFile("models/test_encoder_embedding_models.py", 100),