[FEATURE] Add OpenAI-Compatible LoRA Adapter Selection (#11570)

2025-10-21 00:44:33 -07:00
parent 7e6191c098
commit 852c0578fd
10 changed files with 815 additions and 40 deletions
--- a/examples/runtime/lora.py
+++ b/examples/runtime/lora.py
@@ -1,37 +1,67 @@
-# launch server
-# python -m sglang.launch_server --model mistralai/Mistral-7B-Instruct-v0.3 --lora-paths /home/ying/test_lora lora1=/home/ying/test_lora_1 lora2=/home/ying/test_lora_2 --disable-radix --disable-cuda-graph --max-loras-per-batch 4
+"""
+OpenAI-compatible LoRA adapter usage with SGLang.

-# send requests
-# lora_path[i] specifies the LoRA used for text[i], so make sure they have the same length
-# use None to specify base-only prompt, e.x. "lora_path": [None, "/home/ying/test_lora"]
-import json
+Server Setup:
+    python -m sglang.launch_server \\
+        --model meta-llama/Llama-3.1-8B-Instruct \\
+        --enable-lora \\
+        --lora-paths sql=/path/to/sql python=/path/to/python
+"""

-import requests
+import openai

-url = "http://127.0.0.1:30000"
-json_data = {
-    "text": [
-        "prompt 1",
-        "prompt 2",
-        "prompt 3",
-        "prompt 4",
-        "prompt 5",
-        "prompt 6",
-        "prompt 7",
-    ],
-    "sampling_params": {"max_new_tokens": 32},
-    "lora_path": [
-        "/home/ying/test_lora",
-        "lora1",
-        "lora2",
-        "lora1",
-        "lora2",
-        None,
-        None,
-    ],
-}
-response = requests.post(
-    url + "/generate",
-    json=json_data,
-)
-print(json.dumps(response.json()))
+client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
+
+
+def main():
+    print("SGLang OpenAI-Compatible LoRA Examples\n")
+
+    # Example 1 (new): adapter chosen inside `model` as "<base model>:<adapter name>"
+    print("1. Chat with LoRA adapter in model parameter:")
+    response = client.chat.completions.create(
+        model="meta-llama/Llama-3.1-8B-Instruct:sql",  # ← model:adapter syntax
+        messages=[{"role": "user", "content": "Convert to SQL: show all users"}],
+        max_tokens=50,
+    )
+    print(f"   Response: {response.choices[0].message.content}\n")
+
+    # Example 2: same "<base model>:<adapter name>" form via the Completions API
+    print("2. Completion with LoRA adapter:")
+    response = client.completions.create(
+        model="meta-llama/Llama-3.1-8B-Instruct:python",
+        prompt="def fibonacci(n):",
+        max_tokens=50,
+    )
+    print(f"   Response: {response.choices[0].text}\n")
+
+    # Example 3 (old): plain model name, adapter sent as `lora_path` in extra_body
+    print("3. Backward compatible (explicit lora_path):")
+    response = client.chat.completions.create(
+        model="meta-llama/Llama-3.1-8B-Instruct",
+        messages=[{"role": "user", "content": "Convert to SQL: show all users"}],
+        extra_body={"lora_path": "sql"},
+        max_tokens=50,
+    )
+    print(f"   Response: {response.choices[0].message.content}\n")
+
+    # Example 4: base model only (no adapter suffix and no lora_path)
+    print("4. Base model without adapter:")
+    response = client.chat.completions.create(
+        model="meta-llama/Llama-3.1-8B-Instruct",
+        messages=[{"role": "user", "content": "Hello!"}],
+        max_tokens=30,
+    )
+    print(f"   Response: {response.choices[0].message.content}\n")
+
+    print("All examples completed!")
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:  # script boundary: print the error and a setup hint
+        print(f"Error: {e}")
+        print(
+            "\nEnsure server is running:\n"
+            "  python -m sglang.launch_server --model ... --enable-lora --lora-paths ..."
+        )