Support radix cache for LoRA feature (#7216)
This commit is contained in:
@@ -104,7 +104,6 @@ class TestLoRA(CustomTestCase):
|
||||
lora_paths=[lora_adapter_paths[0], lora_adapter_paths[1]],
|
||||
max_loras_per_batch=len(lora_adapter_paths) + 1,
|
||||
lora_backend=backend,
|
||||
disable_radix_cache=True,
|
||||
sleep_on_idle=True, # Eliminate non-determinism by forcing all requests to be processed in one batch.
|
||||
attention_backend="torch_native",
|
||||
)
|
||||
|
||||
@@ -97,7 +97,6 @@ class TestLoRAEviction(CustomTestCase):
|
||||
lora_paths=initial_lora_paths,
|
||||
max_loras_per_batch=1,
|
||||
lora_backend=backend,
|
||||
disable_radix_cache=True,
|
||||
enable_lora=True,
|
||||
max_lora_rank=256,
|
||||
lora_target_modules=["all"],
|
||||
|
||||
@@ -140,7 +140,6 @@ class TestLoRA(CustomTestCase):
|
||||
lora_paths=[lora_adapter_paths[0], lora_adapter_paths[1]],
|
||||
max_loras_per_batch=len(lora_adapter_paths) + 1,
|
||||
lora_backend=backend,
|
||||
disable_radix_cache=True,
|
||||
)
|
||||
hf_runner = HFRunner(
|
||||
base_path,
|
||||
|
||||
83
test/srt/lora/test_lora_radix_cache.py
Normal file
83
test/srt/lora/test_lora_radix_cache.py
Normal file
@@ -0,0 +1,83 @@
|
||||
# Copyright 2023-2024 SGLang Team
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
import multiprocessing as mp
|
||||
import random
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
from utils import CI_MULTI_LORA_MODELS, DEFAULT_PROMPTS, run_lora_test_one_by_one
|
||||
|
||||
from sglang.test.runners import HFRunner, SRTRunner
|
||||
from sglang.test.test_utils import CustomTestCase
|
||||
|
||||
PROMPTS = [
|
||||
"AI is a field of computer science focused on",
|
||||
"""
|
||||
### Instruction:
|
||||
Tell me about llamas and alpacas
|
||||
### Response:
|
||||
Llamas are large, long-necked animals with a woolly coat. They have two toes on each foot instead of three like other camelids.
|
||||
### Question:
|
||||
What do you know about llamas?
|
||||
### Answer:
|
||||
""",
|
||||
]
|
||||
|
||||
|
||||
class TestLoRARadixCache(CustomTestCase):
    """Runs the one-by-one LoRA test helper with the radix cache on, then off."""

    def test_lora_radix_cache(self):
        # A model case with multiple adapters is required to check the
        # correctness of the radix cache.
        case = CI_MULTI_LORA_MODELS[0]

        # Drop prompts of 1000 characters or more when the model case asks
        # to skip long prompts; otherwise use the full prompt list.
        if case.skip_long_prompt:
            prompts = [text for text in PROMPTS if len(text) < 1000]
        else:
            prompts = PROMPTS

        # Order matters: the radix-cache-enabled run goes first, followed by
        # the run with the radix cache disabled.
        cache_modes = (
            (False, "lora-with-radix-cache"),
            (True, "lora-without-radix-cache"),
        )
        for radix_cache_disabled, tag in cache_modes:
            run_lora_test_one_by_one(
                prompts,
                case,
                torch.float16,
                max_new_tokens=32,
                backend="triton",
                disable_radix_cache=radix_cache_disabled,
                test_tag=tag,
            )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Request the "spawn" start method; a RuntimeError from this call is
    # deliberately ignored so the tests still run with whatever start
    # method is already in effect.
    try:
        mp.set_start_method("spawn")
    except RuntimeError:
        pass

    unittest.main(warnings="ignore")
|
||||
@@ -787,7 +787,6 @@ class LoRAUpdateEngineTestSession(LoRAUpdateTestSessionBase):
|
||||
max_loaded_loras=self.max_loaded_loras,
|
||||
disable_cuda_graph=self.disable_cuda_graph,
|
||||
cuda_graph_max_bs=self.cuda_graph_max_bs,
|
||||
disable_radix_cache=True,
|
||||
enable_lora=self.enable_lora,
|
||||
)
|
||||
self.handle.__enter__()
|
||||
@@ -917,7 +916,6 @@ class LoRAUpdateServerTestSession(LoRAUpdateTestSessionBase):
|
||||
str(self.max_loras_per_batch),
|
||||
"--lora-backend",
|
||||
self.lora_backend,
|
||||
"--disable-radix-cache",
|
||||
"--random-seed",
|
||||
"42",
|
||||
"--max-running-request",
|
||||
|
||||
@@ -136,7 +136,7 @@ def run_lora_test_one_by_one(
|
||||
max_new_tokens: int,
|
||||
backend: str,
|
||||
disable_cuda_graph: bool = False,
|
||||
disable_radix_cache: bool = True,
|
||||
disable_radix_cache: bool = False,
|
||||
mem_fraction_static: float = 0.88,
|
||||
test_tag: str = "",
|
||||
):
|
||||
@@ -156,7 +156,7 @@ def run_lora_test_one_by_one(
|
||||
max_new_tokens (int): The maximum number of new tokens to generate.
|
||||
backend (str): The lora backend to use.
|
||||
disable_cuda_graph (bool, optional): Whether to disable CUDA graph. Defaults to False.
|
||||
disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to True.
|
||||
disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to False.
|
||||
mem_fraction_static (float, optional): The fraction of memory to use. Defaults to 0.88.
|
||||
test_tag (str, optional): The tag to use for the test. Defaults to "".
|
||||
"""
|
||||
@@ -284,7 +284,7 @@ def run_lora_test_by_batch(
|
||||
max_new_tokens: int,
|
||||
backend: str,
|
||||
disable_cuda_graph: bool = False,
|
||||
disable_radix_cache: bool = True,
|
||||
disable_radix_cache: bool = False,
|
||||
mem_fraction_static: float = 0.88,
|
||||
test_tag: str = "",
|
||||
):
|
||||
@@ -303,7 +303,7 @@ def run_lora_test_by_batch(
|
||||
max_new_tokens (int): The maximum number of new tokens to generate.
|
||||
backend (str): The lora backend to use.
|
||||
disable_cuda_graph (bool, optional): Whether to disable CUDA graph. Defaults to False.
|
||||
disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to True.
|
||||
disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to False.
|
||||
mem_fraction_static (float, optional): The fraction of memory to use. Defaults to 0.88.
|
||||
test_tag (str, optional): The tag to use for the test. Defaults to "".
|
||||
"""
|
||||
|
||||
@@ -23,6 +23,7 @@ suites = {
|
||||
TestFile("lora/test_lora_cuda_graph.py", 250),
|
||||
TestFile("lora/test_lora_update.py", 400),
|
||||
TestFile("lora/test_lora_qwen3.py", 97),
|
||||
TestFile("lora/test_lora_radix_cache.py", 100),
|
||||
TestFile("models/test_embedding_models.py", 73),
|
||||
# TestFile("models/test_clip_models.py", 52),
|
||||
TestFile("models/test_encoder_embedding_models.py", 100),
|
||||
|
||||
Reference in New Issue
Block a user