Update quick start examples (#120)

This commit is contained in:
Lianmin Zheng
2024-01-30 04:29:32 -08:00
committed by GitHub
parent 4ea92f8307
commit 0617528632
20 changed files with 567 additions and 237 deletions

View File

@@ -39,40 +39,10 @@ pip install -e "python[all]"
- For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version. - For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"` - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
## Quick Start ## Quick Start
The example below shows how to use sglang to answer a multi-turn question. The example below shows how to use sglang to answer a multi-turn question.
### Using OpenAI Models
Set the OpenAI API Key
```
export OPENAI_API_KEY=sk-******
```
Then, answer a multi-turn question.
```python
from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
@function
def multi_turn_question(s, question_1, question_2):
s += system("You are a helpful assistant.")
s += user(question_1)
s += assistant(gen("answer_1", max_tokens=256))
s += user(question_2)
s += assistant(gen("answer_2", max_tokens=256))
set_default_backend(OpenAI("gpt-3.5-turbo"))
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)
for m in state.messages():
print(m["role"], ":", m["content"])
print(state["answer_1"])
```
### Using Local Models ### Using Local Models
First, launch a server with First, launch a server with
``` ```
@@ -105,6 +75,37 @@ for m in state.messages():
print(state["answer_1"]) print(state["answer_1"])
``` ```
### Using OpenAI Models
Set the OpenAI API Key
```
export OPENAI_API_KEY=sk-******
```
Then, answer a multi-turn question.
```python
from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
@function
def multi_turn_question(s, question_1, question_2):
s += system("You are a helpful assistant.")
s += user(question_1)
s += assistant(gen("answer_1", max_tokens=256))
s += user(question_2)
s += assistant(gen("answer_2", max_tokens=256))
set_default_backend(OpenAI("gpt-3.5-turbo"))
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)
for m in state.messages():
print(m["role"], ":", m["content"])
print(state["answer_1"])
```
### More Examples ### More Examples
Anthropic and VertexAI (Gemini) models are also supported. Anthropic and VertexAI (Gemini) models are also supported.
@@ -120,7 +121,7 @@ import sglang as sgl
`sglang` provides some simple primitives such as `gen`, `select`, `fork`, `image`. `sglang` provides some simple primitives such as `gen`, `select`, `fork`, `image`.
You can implement your prompt flow in a function decorated by `sgl.function`. You can implement your prompt flow in a function decorated by `sgl.function`.
You can then invoke the function with `run` or `run_batch`. You can then invoke the function with `run` or `run_batch`.
The system will manage the state, chat template, and parallelism for you. The system will manage the state, chat template, parallelism and batching for you.
### Control Flow ### Control Flow
You can use any Python code within the function body, including control flow, nested function calls, and external libraries. You can use any Python code within the function body, including control flow, nested function calls, and external libraries.

View File

@@ -1,19 +1,67 @@
from sglang import function, system, user, assistant, gen, set_default_backend, Anthropic """
Usage:
export ANTHROPIC_API_KEY=sk-******
python3 anthropic_example_chat.py
"""
import sglang as sgl
@function @sgl.function
def multi_turn_question(s, question_1, question_2): def multi_turn_question(s, question_1, question_2):
s += user(question_1) s += sgl.user(question_1)
s += assistant(gen("answer_1", max_tokens=256)) s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
s += user(question_2) s += sgl.user(question_2)
s += assistant(gen("answer_2", max_tokens=256)) s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
set_default_backend(Anthropic("claude-2"))
state = multi_turn_question.run( def single():
question_1="What is the capital of the United States?", state = multi_turn_question.run(
question_2="List two local attractions.", question_1="What is the capital of the United States?",
) question_2="List two local attractions.",
)
for m in state.messages(): for m in state.messages():
print(m["role"], ":", m["content"]) print(m["role"], ":", m["content"])
print("answer_1", state["answer_1"])
def stream():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True
)
for out in state.text_iter():
print(out, end="", flush=True)
print()
def batch():
states = multi_turn_question.run_batch([
{"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions."},
{"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?"},
])
for s in states:
print(s.messages())
if __name__ == "__main__":
sgl.set_default_backend(sgl.Anthropic("claude-2"))
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()

View File

@@ -1,7 +1,13 @@
from sglang import function, gen, set_default_backend, Anthropic """
Usage:
export ANTHROPIC_API_KEY=sk-******
python3 anthropic_example_complete.py
"""
import sglang as sgl
@function @sgl.function
def few_shot_qa(s, question): def few_shot_qa(s, question):
s += ( s += (
""" """
@@ -13,14 +19,49 @@ def few_shot_qa(s, question):
\n\nAssistant: Rome \n\nAssistant: Rome
""") """)
s += "\n\nHuman: " + question + "\n" s += "\n\nHuman: " + question + "\n"
s += "\n\nAssistant:" + gen("answer", stop="\n", temperature=0) s += "\n\nAssistant:" + sgl.gen("answer", stop="\n", temperature=0)
set_default_backend(Anthropic("claude-2")) def single():
state = few_shot_qa.run(question="What is the capital of the United States?")
answer = state["answer"].strip().lower()
state = few_shot_qa.run(question="What is the capital of the United States?") assert "washington" in answer, f"answer: {state['answer']}"
answer = state["answer"].strip().lower()
assert "washington" in answer, f"answer: {state['answer']}" print(state.text())
print(state.text())
def stream():
state = few_shot_qa.run(
question="What is the capital of the United States?",
stream=True)
for out in state.text_iter("answer"):
print(out, end="", flush=True)
print()
def batch():
states = few_shot_qa.run_batch([
{"question": "What is the capital of the United States?"},
{"question": "What is the capital of China?"},
])
for s in states:
print(s["answer"])
if __name__ == "__main__":
sgl.set_default_backend(sgl.Anthropic("claude-2"))
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()

View File

@@ -1,20 +0,0 @@
from sglang import function, system, user, assistant, gen, set_default_backend, Anthropic
@function
def multi_turn_question(s, question_1, question_2):
s += user(question_1)
s += assistant(gen("answer_1", max_tokens=256))
s += user(question_2)
s += assistant(gen("answer_2", max_tokens=256))
set_default_backend(Anthropic("claude-2"))
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True
)
for out in state.text_iter():
print(out, end="", flush=True)

View File

@@ -0,0 +1,67 @@
"""
Usage:
export GCP_PROJECT_ID=******
python3 gemini_example_chat.py
"""
import sglang as sgl
@sgl.function
def multi_turn_question(s, question_1, question_2):
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
s += sgl.user(question_2)
s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
def single():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)
for m in state.messages():
print(m["role"], ":", m["content"])
print("answer_1", state["answer_1"])
def stream():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True
)
for out in state.text_iter():
print(out, end="", flush=True)
print()
def batch():
states = multi_turn_question.run_batch([
{"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions."},
{"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?"},
])
for s in states:
print(s.messages())
if __name__ == "__main__":
sgl.set_default_backend(sgl.VertexAI("gemini-pro"))
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()

View File

@@ -1,7 +1,13 @@
from sglang import function, gen, set_default_backend, VertexAI """
Usage:
export GCP_PROJECT_ID=******
python3 gemini_example_complete.py
"""
import sglang as sgl
@function @sgl.function
def few_shot_qa(s, question): def few_shot_qa(s, question):
s += ( s += (
"""The following are questions with answers. """The following are questions with answers.
@@ -13,14 +19,49 @@ Q: What is the capital of Italy?
A: Rome A: Rome
""") """)
s += "Q: " + question + "\n" s += "Q: " + question + "\n"
s += "A:" + gen("answer", stop="\n", temperature=0) s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
set_default_backend(VertexAI("gemini-pro")) def single():
state = few_shot_qa.run(question="What is the capital of the United States?")
answer = state["answer"].strip().lower()
state = few_shot_qa.run(question="What is the capital of the United States?") assert "washington" in answer, f"answer: {state['answer']}"
answer = state["answer"].strip().lower()
assert "washington" in answer, f"answer: {state['answer']}" print(state.text())
print(state.text())
def stream():
state = few_shot_qa.run(
question="What is the capital of the United States?",
stream=True)
for out in state.text_iter("answer"):
print(out, end="", flush=True)
print()
def batch():
states = few_shot_qa.run_batch([
{"question": "What is the capital of the United States?"},
{"question": "What is the capital of China?"},
])
for s in states:
print(s["answer"])
if __name__ == "__main__":
sgl.set_default_backend(sgl.VertexAI("gemini-pro"))
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()

View File

@@ -1,19 +1,29 @@
from sglang import function, user, assistant, gen, image, set_default_backend, VertexAI """
Usage:
export GCP_PROJECT_ID=******
python3 gemini_example_multimodal_chat.py
"""
import sglang as sgl
@function @sgl.function
def image_qa(s, image_file1, image_file2, question): def image_qa(s, image_file1, image_file2, question):
s += user(image(image_file1) + image(image_file2) + question) s += sgl.user(sgl.image(image_file1) + sgl.image(image_file2) + question)
s += assistant(gen("answer_1", max_tokens=256)) s += sgl.assistant(sgl.gen("answer", max_tokens=256))
set_default_backend(VertexAI("gemini-pro-vision"))
state = image_qa.run( if __name__ == "__main__":
image_file1="./images/cat.jpeg", sgl.set_default_backend(sgl.VertexAI("gemini-pro-vision"))
image_file2="./images/dog.jpeg",
question="Describe difference of the 2 images in one sentence.",
stream=True
)
for out in state.text_iter(): state = image_qa.run(
print(out, end="", flush=True) image_file1="./images/cat.jpeg",
image_file2="./images/dog.jpeg",
question="Describe difference of the two images in one sentence.",
stream=True
)
for out in state.text_iter("answer"):
print(out, end="", flush=True)
print()
print(state["answer"])

View File

@@ -1,20 +0,0 @@
from sglang import function, user, assistant, gen, set_default_backend, VertexAI
@function
def multi_turn_question(s, question_1, question_2):
s += user(question_1)
s += assistant(gen("answer_1", max_tokens=256))
s += user(question_2)
s += assistant(gen("answer_2", max_tokens=256))
set_default_backend(VertexAI("gemini-pro"))
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True
)
for out in state.text_iter():
print(out, end="", flush=True)

View File

@@ -1,20 +1,68 @@
from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI """
Usage:
export OPENAI_API_KEY=sk-******
python3 openai_example_chat.py
"""
import sglang as sgl
@function @sgl.function
def multi_turn_question(s, question_1, question_2): def multi_turn_question(s, question_1, question_2):
s += system("You are a helpful assistant.") s += sgl.system("You are a helpful assistant.")
s += user(question_1) s += sgl.user(question_1)
s += assistant(gen("answer_1", max_tokens=256)) s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
s += user(question_2) s += sgl.user(question_2)
s += assistant(gen("answer_2", max_tokens=256)) s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
set_default_backend(OpenAI("gpt-3.5-turbo"))
state = multi_turn_question.run( def single():
question_1="What is the capital of the United States?", state = multi_turn_question.run(
question_2="List two local attractions.", question_1="What is the capital of the United States?",
) question_2="List two local attractions.",
)
for m in state.messages(): for m in state.messages():
print(m["role"], ":", m["content"]) print(m["role"], ":", m["content"])
print("answer_1", state["answer_1"])
def stream():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True
)
for out in state.text_iter():
print(out, end="", flush=True)
print()
def batch():
states = multi_turn_question.run_batch([
{"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions."},
{"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?"},
])
for s in states:
print(s.messages())
if __name__ == "__main__":
sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()

View File

@@ -1,7 +1,13 @@
from sglang import function, gen, set_default_backend, OpenAI """
Usage:
export OPENAI_API_KEY=sk-******
python3 openai_example_complete.py
"""
import sglang as sgl
@function @sgl.function
def few_shot_qa(s, question): def few_shot_qa(s, question):
s += ( s += (
"""The following are questions with answers. """The following are questions with answers.
@@ -13,14 +19,49 @@ Q: What is the capital of Italy?
A: Rome A: Rome
""") """)
s += "Q: " + question + "\n" s += "Q: " + question + "\n"
s += "A:" + gen("answer", stop="\n", temperature=0) s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
set_default_backend(OpenAI("gpt-3.5-turbo-instruct")) def single():
state = few_shot_qa.run(question="What is the capital of the United States?")
answer = state["answer"].strip().lower()
state = few_shot_qa.run(question="What is the capital of the United States?") assert "washington" in answer, f"answer: {state['answer']}"
answer = state["answer"].strip().lower()
assert "washington" in answer, f"answer: {state['answer']}" print(state.text())
print(state.text())
def stream():
state = few_shot_qa.run(
question="What is the capital of the United States?",
stream=True)
for out in state.text_iter("answer"):
print(out, end="", flush=True)
print()
def batch():
states = few_shot_qa.run_batch([
{"question": "What is the capital of the United States?"},
{"question": "What is the capital of China?"},
])
for s in states:
print(s["answer"])
if __name__ == "__main__":
sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()

View File

@@ -1,21 +0,0 @@
from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
@function
def multi_turn_question(s, question_1, question_2):
s += system("You are a helpful assistant.")
s += user(question_1)
s += assistant(gen("answer_1", max_tokens=256))
s += user(question_2)
s += assistant(gen("answer_2", max_tokens=256))
set_default_backend(OpenAI("gpt-3.5-turbo"))
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True
)
for out in state.text_iter():
print(out, end="", flush=True)

View File

@@ -1,26 +1,69 @@
from sglang import function, system, user, assistant, gen, set_default_backend, Runtime """
Usage:
python3 srt_example_chat.py
"""
import sglang as sgl
@function @sgl.function
def multi_turn_question(s, question_1, question_2): def multi_turn_question(s, question_1, question_2):
s += system("You are a helpful assistant.") s += sgl.user(question_1)
s += user(question_1) s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
s += assistant(gen("answer_1", max_tokens=256)) s += sgl.user(question_2)
s += user(question_2) s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
s += assistant(gen("answer_2", max_tokens=256))
runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf") def single():
#runtime = Runtime(model_path="mistralai/Mixtral-8x7B-Instruct-v0.1") state = multi_turn_question.run(
set_default_backend(runtime) question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)
state = multi_turn_question.run( for m in state.messages():
question_1="What is the capital of the United States?", print(m["role"], ":", m["content"])
question_2="List two local attractions.",
)
for m in state.messages(): print("answer_1", state["answer_1"])
print(m["role"], ":", m["content"])
runtime.shutdown() def stream():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True
)
for out in state.text_iter():
print(out, end="", flush=True)
print()
def batch():
states = multi_turn_question.run_batch([
{"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions."},
{"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?"},
])
for s in states:
print(s.messages())
if __name__ == "__main__":
runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
sgl.set_default_backend(runtime)
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
runtime.shutdown()

View File

@@ -1,7 +1,11 @@
from sglang import function, gen, set_default_backend, Runtime """
Usage:
python3 srt_example_complete.py
"""
import sglang as sgl
@function @sgl.function
def few_shot_qa(s, question): def few_shot_qa(s, question):
s += ( s += (
"""The following are questions with answers. """The following are questions with answers.
@@ -13,16 +17,52 @@ Q: What is the capital of Italy?
A: Rome A: Rome
""") """)
s += "Q: " + question + "\n" s += "Q: " + question + "\n"
s += "A:" + gen("answer", stop="\n", temperature=0) s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf") def single():
set_default_backend(runtime) state = few_shot_qa.run(question="What is the capital of the United States?")
answer = state["answer"].strip().lower()
state = few_shot_qa.run(question="What is the capital of the United States?") assert "washington" in answer, f"answer: {state['answer']}"
answer = state["answer"].strip().lower() print(state.text())
assert "washington" in answer, f"answer: {state['answer']}"
print(state.text())
runtime.shutdown()
def stream():
state = few_shot_qa.run(
question="What is the capital of the United States?",
stream=True)
for out in state.text_iter("answer"):
print(out, end="", flush=True)
print()
def batch():
states = few_shot_qa.run_batch([
{"question": "What is the capital of the United States?"},
{"question": "What is the capital of China?"},
])
for s in states:
print(s["answer"])
if __name__ == "__main__":
runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
sgl.set_default_backend(runtime)
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
runtime.shutdown()

View File

@@ -10,29 +10,53 @@ def image_qa(s, image_path, question):
s += sgl.assistant(sgl.gen("answer")) s += sgl.assistant(sgl.gen("answer"))
runtime = sgl.Runtime(model_path="liuhaotian/llava-v1.5-7b", def single():
tokenizer_path="llava-hf/llava-1.5-7b-hf") state = image_qa.run(
sgl.set_default_backend(runtime) image_path="images/cat.jpeg",
question="What is this?",
max_new_tokens=64)
print(state["answer"], "\n")
# Single def stream():
state = image_qa.run( state = image_qa.run(
image_path="images/cat.jpeg", image_path="images/cat.jpeg",
question="What is this?", question="What is this?",
max_new_tokens=64) max_new_tokens=64,
print(state["answer"], "\n") stream=True)
for out in state.text_iter("answer"):
print(out, end="", flush=True)
print()
# Batch def batch():
states = image_qa.run_batch( states = image_qa.run_batch(
[ [
{"image_path": "images/cat.jpeg", "question":"What is this?"}, {"image_path": "images/cat.jpeg", "question":"What is this?"},
{"image_path": "images/dog.jpeg", "question":"What is this?"}, {"image_path": "images/dog.jpeg", "question":"What is this?"},
], ],
max_new_tokens=64, max_new_tokens=64,
) )
for s in states: for s in states:
print(s["answer"], "\n") print(s["answer"], "\n")
runtime.shutdown() if __name__ == "__main__":
runtime = sgl.Runtime(model_path="liuhaotian/llava-v1.5-7b",
tokenizer_path="llava-hf/llava-1.5-7b-hf")
sgl.set_default_backend(runtime)
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
runtime.shutdown()

View File

@@ -1,26 +0,0 @@
from sglang import function, system, user, assistant, gen, set_default_backend, Runtime
@function
def multi_turn_question(s, question_1, question_2):
s += system("You are a helpful assistant.")
s += user(question_1)
s += assistant(gen("answer_1", max_tokens=256))
s += user(question_2)
s += assistant(gen("answer_2", max_tokens=256))
runtime = Runtime("meta-llama/Llama-2-7b-chat-hf")
set_default_backend(runtime)
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
temperature=0,
stream=True,
)
for out in state.text_iter():
print(out, end="", flush=True)
print()
runtime.shutdown()

View File

@@ -651,7 +651,7 @@ class ProgramState:
def sync(self): def sync(self):
return self.stream_executor.sync() return self.stream_executor.sync()
def text_iter(self, var_name=None): def text_iter(self, var_name: Optional[str] = None):
if self.stream_executor.stream: if self.stream_executor.stream:
prev = 0 prev = 0
if var_name is None: if var_name is None:
@@ -682,7 +682,9 @@ class ProgramState:
else: else:
yield self.get_var(name) yield self.get_var(name)
async def text_async_iter(self, var_name=None, return_meta_data=False): async def text_async_iter(
self, var_name: Optional[str] = None, return_meta_data: bool = False
):
loop = asyncio.get_running_loop() loop = asyncio.get_running_loop()
if self.stream_executor.stream: if self.stream_executor.stream:

View File

@@ -74,7 +74,9 @@ class SglSamplingParams:
) )
return { return {
"max_tokens_to_sample": self.max_new_tokens, "max_tokens_to_sample": self.max_new_tokens,
"stop_sequences": self.stop, "stop_sequences": self.stop
if isinstance(self.stop, (list, tuple))
else [self.stop],
"temperature": self.temperature, "temperature": self.temperature,
"top_p": self.top_p, "top_p": self.top_p,
"top_k": self.top_k, "top_k": self.top_k,

View File

@@ -8,7 +8,6 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata from sglang.srt.managers.router.model_runner import InputMetadata
from torch import nn from torch import nn
from transformers import Qwen2Config
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
@@ -30,6 +29,8 @@ from vllm.model_executor.weight_utils import (
hf_model_weights_iterator, hf_model_weights_iterator,
) )
Qwen2Config = None
class Qwen2MLP(nn.Module): class Qwen2MLP(nn.Module):
def __init__( def __init__(

View File

@@ -445,18 +445,26 @@ class Runtime:
pipe_reader, pipe_writer = mp.Pipe(duplex=False) pipe_reader, pipe_writer = mp.Pipe(duplex=False)
proc = mp.Process(target=launch_server, args=(self.server_args, pipe_writer)) proc = mp.Process(target=launch_server, args=(self.server_args, pipe_writer))
proc.start() proc.start()
pipe_writer.close()
self.pid = proc.pid self.pid = proc.pid
init_state = pipe_reader.recv() try:
init_state = pipe_reader.recv()
except EOFError:
init_state = ""
if init_state != "init ok": if init_state != "init ok":
self.shutdown() self.shutdown()
raise RuntimeError("Launch failed") raise RuntimeError("Launch failed. Please see the error messages above.")
self.endpoint = RuntimeEndpoint(self.url) self.endpoint = RuntimeEndpoint(self.url)
def shutdown(self): def shutdown(self):
if self.pid is not None: if self.pid is not None:
parent = psutil.Process(self.pid) try:
parent = psutil.Process(self.pid)
except psutil.NoSuchProcess:
return
children = parent.children(recursive=True) children = parent.children(recursive=True)
for child in children: for child in children:
child.kill() child.kill()