From 0617528632fe266427e1ee6cf5037e3fca06e538 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 30 Jan 2024 04:29:32 -0800 Subject: [PATCH] Update quick start examples (#120) --- README.md | 65 ++++++++-------- .../quick_start/anthropic_example_chat.py | 74 ++++++++++++++---- .../quick_start/anthropic_example_complete.py | 57 ++++++++++++-- .../quick_start/anthropic_example_stream.py | 20 ----- examples/quick_start/gemini_example_chat.py | 67 ++++++++++++++++ .../quick_start/gemini_example_complete.py | 57 ++++++++++++-- .../gemini_example_multimodal_chat.py | 36 +++++---- examples/quick_start/gemini_example_stream.py | 20 ----- examples/quick_start/openai_example_chat.py | 76 ++++++++++++++---- .../quick_start/openai_example_complete.py | 57 ++++++++++++-- examples/quick_start/openai_example_stream.py | 21 ----- examples/quick_start/srt_example_chat.py | 77 +++++++++++++++---- examples/quick_start/srt_example_complete.py | 60 ++++++++++++--- examples/quick_start/srt_example_llava.py | 64 ++++++++++----- examples/quick_start/srt_example_stream.py | 26 ------- .../srt_example_regex.py | 0 python/sglang/lang/interpreter.py | 6 +- python/sglang/lang/ir.py | 4 +- python/sglang/srt/models/qwen2.py | 3 +- python/sglang/srt/server.py | 14 +++- 20 files changed, 567 insertions(+), 237 deletions(-) delete mode 100644 examples/quick_start/anthropic_example_stream.py create mode 100644 examples/quick_start/gemini_example_chat.py delete mode 100644 examples/quick_start/gemini_example_stream.py delete mode 100644 examples/quick_start/openai_example_stream.py delete mode 100644 examples/quick_start/srt_example_stream.py rename examples/{quick_start => usage}/srt_example_regex.py (100%) diff --git a/README.md b/README.md index ed1cf94eb..d5892f131 100644 --- a/README.md +++ b/README.md @@ -39,40 +39,10 @@ pip install -e "python[all]" - For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version. - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"` + ## Quick Start The example below shows how to use sglang to answer a mulit-turn question. -### Using OpenAI Models -Set the OpenAI API Key -``` -export OPENAI_API_KEY=sk-****** -``` - -Then, answer a multi-turn question. -```python -from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI - -@function -def multi_turn_question(s, question_1, question_2): - s += system("You are a helpful assistant.") - s += user(question_1) - s += assistant(gen("answer_1", max_tokens=256)) - s += user(question_2) - s += assistant(gen("answer_2", max_tokens=256)) - -set_default_backend(OpenAI("gpt-3.5-turbo")) - -state = multi_turn_question.run( - question_1="What is the capital of the United States?", - question_2="List two local attractions.", -) - -for m in state.messages(): - print(m["role"], ":", m["content"]) - -print(state["answer_1"]) -``` - ### Using Local Models First, launch a server with ``` @@ -105,6 +75,37 @@ for m in state.messages(): print(state["answer_1"]) ``` +### Using OpenAI Models +Set the OpenAI API Key +``` +export OPENAI_API_KEY=sk-****** +``` + +Then, answer a multi-turn question. +```python +from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI + +@function +def multi_turn_question(s, question_1, question_2): + s += system("You are a helpful assistant.") + s += user(question_1) + s += assistant(gen("answer_1", max_tokens=256)) + s += user(question_2) + s += assistant(gen("answer_2", max_tokens=256)) + +set_default_backend(OpenAI("gpt-3.5-turbo")) + +state = multi_turn_question.run( + question_1="What is the capital of the United States?", + question_2="List two local attractions.", +) + +for m in state.messages(): + print(m["role"], ":", m["content"]) + +print(state["answer_1"]) +``` + ### More Examples Anthropic and VertexAI (Gemini) models are also supported. @@ -120,7 +121,7 @@ import sglang as sgl `sglang` provides some simple primitives such as `gen`, `select`, `fork`, `image`. You can implement your prompt flow in a function decorated by `sgl.function`. You can then invoke the function with `run` or `run_batch`. -The system will manage the state, chat template, and parallelism for you. +The system will manage the state, chat template, parallelism and batching for you. ### Control Flow You can use any Python code within the function body, including control flow, nested function calls, and external libraries. diff --git a/examples/quick_start/anthropic_example_chat.py b/examples/quick_start/anthropic_example_chat.py index 75e90ca5e..fe0b31a9b 100644 --- a/examples/quick_start/anthropic_example_chat.py +++ b/examples/quick_start/anthropic_example_chat.py @@ -1,19 +1,67 @@ -from sglang import function, system, user, assistant, gen, set_default_backend, Anthropic +""" +Usage: +export ANTHROPIC_API_KEY=sk-****** +python3 anthropic_example_chat.py +""" +import sglang as sgl -@function +@sgl.function def multi_turn_question(s, question_1, question_2): - s += user(question_1) - s += assistant(gen("answer_1", max_tokens=256)) - s += user(question_2) - s += assistant(gen("answer_2", max_tokens=256)) + s += sgl.user(question_1) + s += sgl.assistant(sgl.gen("answer_1", max_tokens=256)) + s += sgl.user(question_2) + s += sgl.assistant(sgl.gen("answer_2", max_tokens=256)) -set_default_backend(Anthropic("claude-2")) -state = multi_turn_question.run( - question_1="What is the capital of the United States?", - question_2="List two local attractions.", -) +def single(): + state = multi_turn_question.run( + question_1="What is the capital of the United States?", + question_2="List two local attractions.", + ) -for m in state.messages(): - print(m["role"], ":", m["content"]) + for m in state.messages(): + print(m["role"], ":", m["content"]) + + print("answer_1", state["answer_1"]) + + +def stream(): + state = multi_turn_question.run( + question_1="What is the capital of the United States?", + question_2="List two local attractions.", + stream=True + ) + + for out in state.text_iter(): + print(out, end="", flush=True) + print() + + +def batch(): + states = multi_turn_question.run_batch([ + {"question_1": "What is the capital of the United States?", + "question_2": "List two local attractions."}, + + {"question_1": "What is the capital of France?", + "question_2": "What is the population of this city?"}, + ]) + + for s in states: + print(s.messages()) + + +if __name__ == "__main__": + sgl.set_default_backend(sgl.Anthropic("claude-2")) + + # Run a single request + print("\n========== single ==========\n") + single() + + # Stream output + print("\n========== stream ==========\n") + stream() + + # Run a batch of requests + print("\n========== batch ==========\n") + batch() diff --git a/examples/quick_start/anthropic_example_complete.py b/examples/quick_start/anthropic_example_complete.py index 8648f2ff1..5a28a99c5 100644 --- a/examples/quick_start/anthropic_example_complete.py +++ b/examples/quick_start/anthropic_example_complete.py @@ -1,7 +1,13 @@ -from sglang import function, gen, set_default_backend, Anthropic +""" +Usage: +export ANTHROPIC_API_KEY=sk-****** +python3 anthropic_example_complete.py +""" + +import sglang as sgl -@function +@sgl.function def few_shot_qa(s, question): s += ( """ @@ -13,14 +19,49 @@ def few_shot_qa(s, question): \n\nAssistant: Rome """) s += "\n\nHuman: " + question + "\n" - s += "\n\nAssistant:" + gen("answer", stop="\n", temperature=0) + s += "\n\nAssistant:" + sgl.gen("answer", stop="\n", temperature=0) -set_default_backend(Anthropic("claude-2")) +def single(): + state = few_shot_qa.run(question="What is the capital of the United States?") + answer = state["answer"].strip().lower() -state = few_shot_qa.run(question="What is the capital of the United States?") -answer = state["answer"].strip().lower() + assert "washington" in answer, f"answer: {state['answer']}" -assert "washington" in answer, f"answer: {state['answer']}" + print(state.text()) -print(state.text()) + +def stream(): + state = few_shot_qa.run( + question="What is the capital of the United States?", + stream=True) + + for out in state.text_iter("answer"): + print(out, end="", flush=True) + print() + + +def batch(): + states = few_shot_qa.run_batch([ + {"question": "What is the capital of the United States?"}, + {"question": "What is the capital of China?"}, + ]) + + for s in states: + print(s["answer"]) + + +if __name__ == "__main__": + sgl.set_default_backend(sgl.Anthropic("claude-2")) + + # Run a single request + print("\n========== single ==========\n") + single() + + # Stream output + print("\n========== stream ==========\n") + stream() + + # Run a batch of requests + print("\n========== batch ==========\n") + batch() diff --git a/examples/quick_start/anthropic_example_stream.py b/examples/quick_start/anthropic_example_stream.py deleted file mode 100644 index e265e16c7..000000000 --- a/examples/quick_start/anthropic_example_stream.py +++ /dev/null @@ -1,20 +0,0 @@ -from sglang import function, system, user, assistant, gen, set_default_backend, Anthropic - - -@function -def multi_turn_question(s, question_1, question_2): - s += user(question_1) - s += assistant(gen("answer_1", max_tokens=256)) - s += user(question_2) - s += assistant(gen("answer_2", max_tokens=256)) - -set_default_backend(Anthropic("claude-2")) - -state = multi_turn_question.run( - question_1="What is the capital of the United States?", - question_2="List two local attractions.", - stream=True -) - -for out in state.text_iter(): - print(out, end="", flush=True) diff --git a/examples/quick_start/gemini_example_chat.py b/examples/quick_start/gemini_example_chat.py new file mode 100644 index 000000000..f7f70f499 --- /dev/null +++ b/examples/quick_start/gemini_example_chat.py @@ -0,0 +1,67 @@ +""" +Usage: +export GCP_PROJECT_ID=****** +python3 gemini_example_chat.py +""" +import sglang as sgl + + +@sgl.function +def multi_turn_question(s, question_1, question_2): + s += sgl.user(question_1) + s += sgl.assistant(sgl.gen("answer_1", max_tokens=256)) + s += sgl.user(question_2) + s += sgl.assistant(sgl.gen("answer_2", max_tokens=256)) + + +def single(): + state = multi_turn_question.run( + question_1="What is the capital of the United States?", + question_2="List two local attractions.", + ) + + for m in state.messages(): + print(m["role"], ":", m["content"]) + + print("answer_1", state["answer_1"]) + + +def stream(): + state = multi_turn_question.run( + question_1="What is the capital of the United States?", + question_2="List two local attractions.", + stream=True + ) + + for out in state.text_iter(): + print(out, end="", flush=True) + print() + + +def batch(): + states = multi_turn_question.run_batch([ + {"question_1": "What is the capital of the United States?", + "question_2": "List two local attractions."}, + + {"question_1": "What is the capital of France?", + "question_2": "What is the population of this city?"}, + ]) + + for s in states: + print(s.messages()) + + +if __name__ == "__main__": + sgl.set_default_backend(sgl.VertexAI("gemini-pro")) + + # Run a single request + print("\n========== single ==========\n") + single() + + # Stream output + print("\n========== stream ==========\n") + stream() + + # Run a batch of requests + print("\n========== batch ==========\n") + batch() diff --git a/examples/quick_start/gemini_example_complete.py b/examples/quick_start/gemini_example_complete.py index abaaec7c9..255a3ad4c 100644 --- a/examples/quick_start/gemini_example_complete.py +++ b/examples/quick_start/gemini_example_complete.py @@ -1,7 +1,13 @@ -from sglang import function, gen, set_default_backend, VertexAI +""" +Usage: +export GCP_PROJECT_ID=****** +python3 gemini_example_complete.py +""" + +import sglang as sgl -@function +@sgl.function def few_shot_qa(s, question): s += ( """The following are questions with answers. @@ -13,14 +19,49 @@ Q: What is the capital of Italy? A: Rome """) s += "Q: " + question + "\n" - s += "A:" + gen("answer", stop="\n", temperature=0) + s += "A:" + sgl.gen("answer", stop="\n", temperature=0) -set_default_backend(VertexAI("gemini-pro")) +def single(): + state = few_shot_qa.run(question="What is the capital of the United States?") + answer = state["answer"].strip().lower() -state = few_shot_qa.run(question="What is the capital of the United States?") -answer = state["answer"].strip().lower() + assert "washington" in answer, f"answer: {state['answer']}" -assert "washington" in answer, f"answer: {state['answer']}" + print(state.text()) -print(state.text()) + +def stream(): + state = few_shot_qa.run( + question="What is the capital of the United States?", + stream=True) + + for out in state.text_iter("answer"): + print(out, end="", flush=True) + print() + + +def batch(): + states = few_shot_qa.run_batch([ + {"question": "What is the capital of the United States?"}, + {"question": "What is the capital of China?"}, + ]) + + for s in states: + print(s["answer"]) + + +if __name__ == "__main__": + sgl.set_default_backend(sgl.VertexAI("gemini-pro")) + + # Run a single request + print("\n========== single ==========\n") + single() + + # Stream output + print("\n========== stream ==========\n") + stream() + + # Run a batch of requests + print("\n========== batch ==========\n") + batch() diff --git a/examples/quick_start/gemini_example_multimodal_chat.py b/examples/quick_start/gemini_example_multimodal_chat.py index ac5409a4e..fa5e6e8b7 100644 --- a/examples/quick_start/gemini_example_multimodal_chat.py +++ b/examples/quick_start/gemini_example_multimodal_chat.py @@ -1,19 +1,29 @@ -from sglang import function, user, assistant, gen, image, set_default_backend, VertexAI +""" +Usage: +export GCP_PROJECT_ID=****** +python3 gemini_example_multimodal_chat.py +""" +import sglang as sgl -@function +@sgl.function def image_qa(s, image_file1, image_file2, question): - s += user(image(image_file1) + image(image_file2) + question) - s += assistant(gen("answer_1", max_tokens=256)) + s += sgl.user(sgl.image(image_file1) + sgl.image(image_file2) + question) + s += sgl.assistant(sgl.gen("answer", max_tokens=256)) -set_default_backend(VertexAI("gemini-pro-vision")) -state = image_qa.run( - image_file1="./images/cat.jpeg", - image_file2="./images/dog.jpeg", - question="Describe difference of the 2 images in one sentence.", - stream=True -) +if __name__ == "__main__": + sgl.set_default_backend(sgl.VertexAI("gemini-pro-vision")) -for out in state.text_iter(): - print(out, end="", flush=True) + state = image_qa.run( + image_file1="./images/cat.jpeg", + image_file2="./images/dog.jpeg", + question="Describe difference of the two images in one sentence.", + stream=True + ) + + for out in state.text_iter("answer"): + print(out, end="", flush=True) + print() + + print(state["answer"]) diff --git a/examples/quick_start/gemini_example_stream.py b/examples/quick_start/gemini_example_stream.py deleted file mode 100644 index 431e7115d..000000000 --- a/examples/quick_start/gemini_example_stream.py +++ /dev/null @@ -1,20 +0,0 @@ -from sglang import function, user, assistant, gen, set_default_backend, VertexAI - - -@function -def multi_turn_question(s, question_1, question_2): - s += user(question_1) - s += assistant(gen("answer_1", max_tokens=256)) - s += user(question_2) - s += assistant(gen("answer_2", max_tokens=256)) - -set_default_backend(VertexAI("gemini-pro")) - -state = multi_turn_question.run( - question_1="What is the capital of the United States?", - question_2="List two local attractions.", - stream=True -) - -for out in state.text_iter(): - print(out, end="", flush=True) diff --git a/examples/quick_start/openai_example_chat.py b/examples/quick_start/openai_example_chat.py index bdd5b171c..591cd178a 100644 --- a/examples/quick_start/openai_example_chat.py +++ b/examples/quick_start/openai_example_chat.py @@ -1,20 +1,68 @@ -from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI +""" +Usage: +export OPENAI_API_KEY=sk-****** +python3 openai_example_chat.py +""" +import sglang as sgl -@function +@sgl.function def multi_turn_question(s, question_1, question_2): - s += system("You are a helpful assistant.") - s += user(question_1) - s += assistant(gen("answer_1", max_tokens=256)) - s += user(question_2) - s += assistant(gen("answer_2", max_tokens=256)) + s += sgl.system("You are a helpful assistant.") + s += sgl.user(question_1) + s += sgl.assistant(sgl.gen("answer_1", max_tokens=256)) + s += sgl.user(question_2) + s += sgl.assistant(sgl.gen("answer_2", max_tokens=256)) -set_default_backend(OpenAI("gpt-3.5-turbo")) -state = multi_turn_question.run( - question_1="What is the capital of the United States?", - question_2="List two local attractions.", -) +def single(): + state = multi_turn_question.run( + question_1="What is the capital of the United States?", + question_2="List two local attractions.", + ) -for m in state.messages(): - print(m["role"], ":", m["content"]) + for m in state.messages(): + print(m["role"], ":", m["content"]) + + print("answer_1", state["answer_1"]) + + +def stream(): + state = multi_turn_question.run( + question_1="What is the capital of the United States?", + question_2="List two local attractions.", + stream=True + ) + + for out in state.text_iter(): + print(out, end="", flush=True) + print() + + +def batch(): + states = multi_turn_question.run_batch([ + {"question_1": "What is the capital of the United States?", + "question_2": "List two local attractions."}, + + {"question_1": "What is the capital of France?", + "question_2": "What is the population of this city?"}, + ]) + + for s in states: + print(s.messages()) + + +if __name__ == "__main__": + sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo")) + + # Run a single request + print("\n========== single ==========\n") + single() + + # Stream output + print("\n========== stream ==========\n") + stream() + + # Run a batch of requests + print("\n========== batch ==========\n") + batch() diff --git a/examples/quick_start/openai_example_complete.py b/examples/quick_start/openai_example_complete.py index fd74fba69..41b3c9904 100644 --- a/examples/quick_start/openai_example_complete.py +++ b/examples/quick_start/openai_example_complete.py @@ -1,7 +1,13 @@ -from sglang import function, gen, set_default_backend, OpenAI +""" +Usage: +export OPENAI_API_KEY=sk-****** +python3 openai_example_complete.py +""" + +import sglang as sgl -@function +@sgl.function def few_shot_qa(s, question): s += ( """The following are questions with answers. @@ -13,14 +19,49 @@ Q: What is the capital of Italy? A: Rome """) s += "Q: " + question + "\n" - s += "A:" + gen("answer", stop="\n", temperature=0) + s += "A:" + sgl.gen("answer", stop="\n", temperature=0) -set_default_backend(OpenAI("gpt-3.5-turbo-instruct")) +def single(): + state = few_shot_qa.run(question="What is the capital of the United States?") + answer = state["answer"].strip().lower() -state = few_shot_qa.run(question="What is the capital of the United States?") -answer = state["answer"].strip().lower() + assert "washington" in answer, f"answer: {state['answer']}" -assert "washington" in answer, f"answer: {state['answer']}" + print(state.text()) -print(state.text()) + +def stream(): + state = few_shot_qa.run( + question="What is the capital of the United States?", + stream=True) + + for out in state.text_iter("answer"): + print(out, end="", flush=True) + print() + + +def batch(): + states = few_shot_qa.run_batch([ + {"question": "What is the capital of the United States?"}, + {"question": "What is the capital of China?"}, + ]) + + for s in states: + print(s["answer"]) + + +if __name__ == "__main__": + sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct")) + + # Run a single request + print("\n========== single ==========\n") + single() + + # Stream output + print("\n========== stream ==========\n") + stream() + + # Run a batch of requests + print("\n========== batch ==========\n") + batch() diff --git a/examples/quick_start/openai_example_stream.py b/examples/quick_start/openai_example_stream.py deleted file mode 100644 index 0ed010701..000000000 --- a/examples/quick_start/openai_example_stream.py +++ /dev/null @@ -1,21 +0,0 @@ -from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI - - -@function -def multi_turn_question(s, question_1, question_2): - s += system("You are a helpful assistant.") - s += user(question_1) - s += assistant(gen("answer_1", max_tokens=256)) - s += user(question_2) - s += assistant(gen("answer_2", max_tokens=256)) - -set_default_backend(OpenAI("gpt-3.5-turbo")) - -state = multi_turn_question.run( - question_1="What is the capital of the United States?", - question_2="List two local attractions.", - stream=True -) - -for out in state.text_iter(): - print(out, end="", flush=True) diff --git a/examples/quick_start/srt_example_chat.py b/examples/quick_start/srt_example_chat.py index a5130dca3..657c19c91 100644 --- a/examples/quick_start/srt_example_chat.py +++ b/examples/quick_start/srt_example_chat.py @@ -1,26 +1,69 @@ -from sglang import function, system, user, assistant, gen, set_default_backend, Runtime +""" +Usage: +python3 srt_example_chat.py +""" +import sglang as sgl -@function +@sgl.function def multi_turn_question(s, question_1, question_2): - s += system("You are a helpful assistant.") - s += user(question_1) - s += assistant(gen("answer_1", max_tokens=256)) - s += user(question_2) - s += assistant(gen("answer_2", max_tokens=256)) + s += sgl.user(question_1) + s += sgl.assistant(sgl.gen("answer_1", max_tokens=256)) + s += sgl.user(question_2) + s += sgl.assistant(sgl.gen("answer_2", max_tokens=256)) -runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf") -#runtime = Runtime(model_path="mistralai/Mixtral-8x7B-Instruct-v0.1") -set_default_backend(runtime) +def single(): + state = multi_turn_question.run( + question_1="What is the capital of the United States?", + question_2="List two local attractions.", + ) -state = multi_turn_question.run( - question_1="What is the capital of the United States?", - question_2="List two local attractions.", -) + for m in state.messages(): + print(m["role"], ":", m["content"]) -for m in state.messages(): - print(m["role"], ":", m["content"]) + print("answer_1", state["answer_1"]) -runtime.shutdown() +def stream(): + state = multi_turn_question.run( + question_1="What is the capital of the United States?", + question_2="List two local attractions.", + stream=True + ) + + for out in state.text_iter(): + print(out, end="", flush=True) + print() + + +def batch(): + states = multi_turn_question.run_batch([ + {"question_1": "What is the capital of the United States?", + "question_2": "List two local attractions."}, + + {"question_1": "What is the capital of France?", + "question_2": "What is the population of this city?"}, + ]) + + for s in states: + print(s.messages()) + + +if __name__ == "__main__": + runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf") + sgl.set_default_backend(runtime) + + # Run a single request + print("\n========== single ==========\n") + single() + + # Stream output + print("\n========== stream ==========\n") + stream() + + # Run a batch of requests + print("\n========== batch ==========\n") + batch() + + runtime.shutdown() diff --git a/examples/quick_start/srt_example_complete.py b/examples/quick_start/srt_example_complete.py index 61e2facbf..200891670 100644 --- a/examples/quick_start/srt_example_complete.py +++ b/examples/quick_start/srt_example_complete.py @@ -1,7 +1,11 @@ -from sglang import function, gen, set_default_backend, Runtime +""" +Usage: +python3 srt_example_complete.py +""" +import sglang as sgl -@function +@sgl.function def few_shot_qa(s, question): s += ( """The following are questions with answers. @@ -13,16 +17,52 @@ Q: What is the capital of Italy? A: Rome """) s += "Q: " + question + "\n" - s += "A:" + gen("answer", stop="\n", temperature=0) + s += "A:" + sgl.gen("answer", stop="\n", temperature=0) -runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf") -set_default_backend(runtime) +def single(): + state = few_shot_qa.run(question="What is the capital of the United States?") + answer = state["answer"].strip().lower() -state = few_shot_qa.run(question="What is the capital of the United States?") + assert "washington" in answer, f"answer: {state['answer']}" -answer = state["answer"].strip().lower() -assert "washington" in answer, f"answer: {state['answer']}" -print(state.text()) + print(state.text()) -runtime.shutdown() + +def stream(): + state = few_shot_qa.run( + question="What is the capital of the United States?", + stream=True) + + for out in state.text_iter("answer"): + print(out, end="", flush=True) + print() + + +def batch(): + states = few_shot_qa.run_batch([ + {"question": "What is the capital of the United States?"}, + {"question": "What is the capital of China?"}, + ]) + + for s in states: + print(s["answer"]) + + +if __name__ == "__main__": + runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf") + sgl.set_default_backend(runtime) + + # Run a single request + print("\n========== single ==========\n") + single() + + # Stream output + print("\n========== stream ==========\n") + stream() + + # Run a batch of requests + print("\n========== batch ==========\n") + batch() + + runtime.shutdown() diff --git a/examples/quick_start/srt_example_llava.py b/examples/quick_start/srt_example_llava.py index 2f4c7b02a..f374d0a0f 100644 --- a/examples/quick_start/srt_example_llava.py +++ b/examples/quick_start/srt_example_llava.py @@ -10,29 +10,53 @@ def image_qa(s, image_path, question): s += sgl.assistant(sgl.gen("answer")) -runtime = sgl.Runtime(model_path="liuhaotian/llava-v1.5-7b", - tokenizer_path="llava-hf/llava-1.5-7b-hf") -sgl.set_default_backend(runtime) +def single(): + state = image_qa.run( + image_path="images/cat.jpeg", + question="What is this?", + max_new_tokens=64) + print(state["answer"], "\n") -# Single -state = image_qa.run( - image_path="images/cat.jpeg", - question="What is this?", - max_new_tokens=64) -print(state["answer"], "\n") +def stream(): + state = image_qa.run( + image_path="images/cat.jpeg", + question="What is this?", + max_new_tokens=64, + stream=True) + + for out in state.text_iter("answer"): + print(out, end="", flush=True) + print() -# Batch -states = image_qa.run_batch( - [ - {"image_path": "images/cat.jpeg", "question":"What is this?"}, - {"image_path": "images/dog.jpeg", "question":"What is this?"}, - ], - max_new_tokens=64, -) -for s in states: - print(s["answer"], "\n") +def batch(): + states = image_qa.run_batch( + [ + {"image_path": "images/cat.jpeg", "question":"What is this?"}, + {"image_path": "images/dog.jpeg", "question":"What is this?"}, + ], + max_new_tokens=64, + ) + for s in states: + print(s["answer"], "\n") -runtime.shutdown() +if __name__ == "__main__": + runtime = sgl.Runtime(model_path="liuhaotian/llava-v1.5-7b", + tokenizer_path="llava-hf/llava-1.5-7b-hf") + sgl.set_default_backend(runtime) + + # Run a single request + print("\n========== single ==========\n") + single() + + # Stream output + print("\n========== stream ==========\n") + stream() + + # Run a batch of requests + print("\n========== batch ==========\n") + batch() + + runtime.shutdown() diff --git a/examples/quick_start/srt_example_stream.py b/examples/quick_start/srt_example_stream.py deleted file mode 100644 index 8f03bd146..000000000 --- a/examples/quick_start/srt_example_stream.py +++ /dev/null @@ -1,26 +0,0 @@ -from sglang import function, system, user, assistant, gen, set_default_backend, Runtime - - -@function -def multi_turn_question(s, question_1, question_2): - s += system("You are a helpful assistant.") - s += user(question_1) - s += assistant(gen("answer_1", max_tokens=256)) - s += user(question_2) - s += assistant(gen("answer_2", max_tokens=256)) - -runtime = Runtime("meta-llama/Llama-2-7b-chat-hf") -set_default_backend(runtime) - -state = multi_turn_question.run( - question_1="What is the capital of the United States?", - question_2="List two local attractions.", - temperature=0, - stream=True, -) - -for out in state.text_iter(): - print(out, end="", flush=True) -print() - -runtime.shutdown() diff --git a/examples/quick_start/srt_example_regex.py b/examples/usage/srt_example_regex.py similarity index 100% rename from examples/quick_start/srt_example_regex.py rename to examples/usage/srt_example_regex.py diff --git a/python/sglang/lang/interpreter.py b/python/sglang/lang/interpreter.py index f281b0b2a..48fbbac31 100644 --- a/python/sglang/lang/interpreter.py +++ b/python/sglang/lang/interpreter.py @@ -651,7 +651,7 @@ class ProgramState: def sync(self): return self.stream_executor.sync() - def text_iter(self, var_name=None): + def text_iter(self, var_name: Optional[str] = None): if self.stream_executor.stream: prev = 0 if var_name is None: @@ -682,7 +682,9 @@ class ProgramState: else: yield self.get_var(name) - async def text_async_iter(self, var_name=None, return_meta_data=False): + async def text_async_iter( + self, var_name: Optional[str] = None, return_meta_data: bool = False + ): loop = asyncio.get_running_loop() if self.stream_executor.stream: diff --git a/python/sglang/lang/ir.py b/python/sglang/lang/ir.py index 2803de51d..85f6f3f29 100644 --- a/python/sglang/lang/ir.py +++ b/python/sglang/lang/ir.py @@ -74,7 +74,9 @@ class SglSamplingParams: ) return { "max_tokens_to_sample": self.max_new_tokens, - "stop_sequences": self.stop, + "stop_sequences": self.stop + if isinstance(self.stop, (list, tuple)) + else [self.stop], "temperature": self.temperature, "top_p": self.top_p, "top_k": self.top_k, diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py index e4dabfe30..e77d4082e 100644 --- a/python/sglang/srt/models/qwen2.py +++ b/python/sglang/srt/models/qwen2.py @@ -8,7 +8,6 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.managers.router.model_runner import InputMetadata from torch import nn -from transformers import Qwen2Config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( @@ -30,6 +29,8 @@ from vllm.model_executor.weight_utils import ( hf_model_weights_iterator, ) +Qwen2Config = None + class Qwen2MLP(nn.Module): def __init__( diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 9750c4e72..06d1b3def 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -445,18 +445,26 @@ class Runtime: pipe_reader, pipe_writer = mp.Pipe(duplex=False) proc = mp.Process(target=launch_server, args=(self.server_args, pipe_writer)) proc.start() + pipe_writer.close() self.pid = proc.pid - init_state = pipe_reader.recv() + try: + init_state = pipe_reader.recv() + except EOFError: + init_state = "" + if init_state != "init ok": self.shutdown() - raise RuntimeError("Launch failed") + raise RuntimeError("Launch failed. Please see the error messages above.") self.endpoint = RuntimeEndpoint(self.url) def shutdown(self): if self.pid is not None: - parent = psutil.Process(self.pid) + try: + parent = psutil.Process(self.pid) + except psutil.NoSuchProcess: + return children = parent.children(recursive=True) for child in children: child.kill()