[FEATURE] Add OpenAI-Compatible LoRA Adapter Selection (#11570)
This commit is contained in:
@@ -1,37 +1,67 @@
|
||||
# launch server
|
||||
# python -m sglang.launch_server --model mistralai/Mistral-7B-Instruct-v0.3 --lora-paths /home/ying/test_lora lora1=/home/ying/test_lora_1 lora2=/home/ying/test_lora_2 --disable-radix --disable-cuda-graph --max-loras-per-batch 4
|
||||
"""
|
||||
OpenAI-compatible LoRA adapter usage with SGLang.
|
||||
|
||||
# send requests
|
||||
# lora_path[i] specifies the LoRA used for text[i], so make sure they have the same length
|
||||
# use None to specify base-only prompt, e.x. "lora_path": [None, "/home/ying/test_lora"]
|
||||
import json
|
||||
Server Setup:
|
||||
python -m sglang.launch_server \\
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \\
|
||||
--enable-lora \\
|
||||
--lora-paths sql=/path/to/sql python=/path/to/python
|
||||
"""
|
||||
|
||||
import requests
|
||||
import openai
|
||||
|
||||
url = "http://127.0.0.1:30000"
|
||||
json_data = {
|
||||
"text": [
|
||||
"prompt 1",
|
||||
"prompt 2",
|
||||
"prompt 3",
|
||||
"prompt 4",
|
||||
"prompt 5",
|
||||
"prompt 6",
|
||||
"prompt 7",
|
||||
],
|
||||
"sampling_params": {"max_new_tokens": 32},
|
||||
"lora_path": [
|
||||
"/home/ying/test_lora",
|
||||
"lora1",
|
||||
"lora2",
|
||||
"lora1",
|
||||
"lora2",
|
||||
None,
|
||||
None,
|
||||
],
|
||||
}
|
||||
response = requests.post(
|
||||
url + "/generate",
|
||||
json=json_data,
|
||||
)
|
||||
print(json.dumps(response.json()))
|
||||
client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
|
||||
|
||||
|
||||
def main():
|
||||
print("SGLang OpenAI-Compatible LoRA Examples\n")
|
||||
|
||||
# Example 1: NEW - Adapter in model parameter (OpenAI-compatible)
|
||||
print("1. Chat with LoRA adapter in model parameter:")
|
||||
response = client.chat.completions.create(
|
||||
model="meta-llama/Llama-3.1-8B-Instruct:sql", # ← adapter:name syntax
|
||||
messages=[{"role": "user", "content": "Convert to SQL: show all users"}],
|
||||
max_tokens=50,
|
||||
)
|
||||
print(f" Response: {response.choices[0].message.content}\n")
|
||||
|
||||
# Example 2: Completions API with adapter
|
||||
print("2. Completion with LoRA adapter:")
|
||||
response = client.completions.create(
|
||||
model="meta-llama/Llama-3.1-8B-Instruct:python",
|
||||
prompt="def fibonacci(n):",
|
||||
max_tokens=50,
|
||||
)
|
||||
print(f" Response: {response.choices[0].text}\n")
|
||||
|
||||
# Example 3: OLD - Backward compatible with explicit lora_path
|
||||
print("3. Backward compatible (explicit lora_path):")
|
||||
response = client.chat.completions.create(
|
||||
model="meta-llama/Llama-3.1-8B-Instruct",
|
||||
messages=[{"role": "user", "content": "Convert to SQL: show all users"}],
|
||||
extra_body={"lora_path": "sql"},
|
||||
max_tokens=50,
|
||||
)
|
||||
print(f" Response: {response.choices[0].message.content}\n")
|
||||
|
||||
# Example 4: Base model (no adapter)
|
||||
print("4. Base model without adapter:")
|
||||
response = client.chat.completions.create(
|
||||
model="meta-llama/Llama-3.1-8B-Instruct",
|
||||
messages=[{"role": "user", "content": "Hello!"}],
|
||||
max_tokens=30,
|
||||
)
|
||||
print(f" Response: {response.choices[0].message.content}\n")
|
||||
|
||||
print("All examples completed!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
main()
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
print(
|
||||
"\nEnsure server is running:\n"
|
||||
" python -m sglang.launch_server --model ... --enable-lora --lora-paths ..."
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user