diff --git a/examples/quick_start/anthropic_example_chat.py b/examples/quick_start/anthropic_example_chat.py
index fe0b31a9b..5bba83fb0 100644
--- a/examples/quick_start/anthropic_example_chat.py
+++ b/examples/quick_start/anthropic_example_chat.py
@@ -23,7 +23,7 @@ def single():
     for m in state.messages():
         print(m["role"], ":", m["content"])
 
-    print("answer_1", state["answer_1"])
+    print("\n-- answer_1 --\n", state["answer_1"])
 
 
 def stream():
diff --git a/examples/quick_start/azure_openai_example_chat.py b/examples/quick_start/azure_openai_example_chat.py
new file mode 100644
index 000000000..3c40af8d2
--- /dev/null
+++ b/examples/quick_start/azure_openai_example_chat.py
@@ -0,0 +1,76 @@
+"""
+Usage:
+export AZURE_OPENAI_API_KEY=sk-******
+python3 azure_openai_example_chat.py
+"""
+import sglang as sgl
+import os
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.system("You are a helpful assistant.")
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
+
+
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
+
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("\n-- answer_1 --\n", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch([
+        {"question_1": "What is the capital of the United States?",
+         "question_2": "List two local attractions."},
+
+        {"question_1": "What is the capital of France?",
+         "question_2": "What is the population of this city?"},
+    ])
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    backend = sgl.OpenAI(
+        model_name="azure-gpt-4",
+        api_version="2023-07-01-preview",
+        azure_endpoint="https://oai-arena-sweden.openai.azure.com/",
+        api_key=os.environ["AZURE_OPENAI_API_KEY"],
+        is_azure=True,
+    )
+    sgl.set_default_backend(backend)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
diff --git a/examples/quick_start/gemini_example_chat.py b/examples/quick_start/gemini_example_chat.py
index f7f70f499..aafa1665c 100644
--- a/examples/quick_start/gemini_example_chat.py
+++ b/examples/quick_start/gemini_example_chat.py
@@ -23,7 +23,7 @@ def single():
     for m in state.messages():
         print(m["role"], ":", m["content"])
 
-    print("answer_1", state["answer_1"])
+    print("\n-- answer_1 --\n", state["answer_1"])
 
 
 def stream():
diff --git a/examples/quick_start/openai_example_chat.py b/examples/quick_start/openai_example_chat.py
index 591cd178a..66b8536c0 100644
--- a/examples/quick_start/openai_example_chat.py
+++ b/examples/quick_start/openai_example_chat.py
@@ -24,7 +24,7 @@ def single():
     for m in state.messages():
         print(m["role"], ":", m["content"])
 
-    print("answer_1", state["answer_1"])
+    print("\n-- answer_1 --\n", state["answer_1"])
 
 
 def stream():
diff --git a/examples/quick_start/srt_example_chat.py b/examples/quick_start/srt_example_chat.py
index 657c19c91..2f261b095 100644
--- a/examples/quick_start/srt_example_chat.py
+++ b/examples/quick_start/srt_example_chat.py
@@ -22,7 +22,7 @@ def single():
     for m in state.messages():
         print(m["role"], ":", m["content"])
 
-    print("answer_1", state["answer_1"])
+    print("\n-- answer_1 --\n", state["answer_1"])
 
 
 def stream():
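The new Azure example relies on the is_azure flag that the backend diff further below adds to sgl.OpenAI. As a rough sketch (not part of the patch), this is approximately the client it constructs internally, assuming the openai>=1.0 SDK; the endpoint, API version, and deployment name are the placeholder values from the example:

    # Sketch only: what sgl.OpenAI(..., is_azure=True) builds under the hood.
    # Endpoint, api_version, and deployment name are example placeholders.
    import os

    import openai

    client = openai.AzureOpenAI(
        api_version="2023-07-01-preview",
        azure_endpoint="https://oai-arena-sweden.openai.azure.com/",
        api_key=os.environ["AZURE_OPENAI_API_KEY"],
    )
    resp = client.chat.completions.create(
        model="azure-gpt-4",  # an Azure deployment name, not an OpenAI model id
        messages=[{"role": "user", "content": "What is the capital of the United States?"}],
        max_tokens=256,
    )
    print(resp.choices[0].message.content)
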
diff --git a/examples/quick_start/together_example_chat.py b/examples/quick_start/together_example_chat.py
new file mode 100644
index 000000000..d2834f44e
--- /dev/null
+++ b/examples/quick_start/together_example_chat.py
@@ -0,0 +1,74 @@
+"""
+Usage:
+export TOGETHER_API_KEY=sk-******
+python3 together_example_chat.py
+"""
+import sglang as sgl
+import os
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.system("You are a helpful assistant.")
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
+
+
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
+
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("\n-- answer_1 --\n", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch([
+        {"question_1": "What is the capital of the United States?",
+         "question_2": "List two local attractions."},
+
+        {"question_1": "What is the capital of France?",
+         "question_2": "What is the population of this city?"},
+    ])
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    backend = sgl.OpenAI(
+        model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
+        base_url="https://api.together.xyz/v1",
+        api_key=os.environ.get("TOGETHER_API_KEY"),
+    )
+    sgl.set_default_backend(backend)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
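The Together example above reuses the OpenAI backend unchanged; only base_url and api_key differ. For reference, a raw-SDK sketch (not part of the patch) of the single chat turn behind one sgl.gen call, assuming the openai>=1.0 SDK against Together's OpenAI-compatible endpoint:

    # Sketch only: the raw request behind one chat turn; sglang adds chat
    # templating, state management, streaming, and batching on top.
    import os

    import openai

    client = openai.OpenAI(
        base_url="https://api.together.xyz/v1",
        api_key=os.environ.get("TOGETHER_API_KEY"),
    )
    resp = client.chat.completions.create(
        model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is the capital of the United States?"},
        ],
        max_tokens=256,
    )
    print(resp.choices[0].message.content)
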
diff --git a/examples/quick_start/together_example_complete.py b/examples/quick_start/together_example_complete.py
new file mode 100644
index 000000000..011c652fd
--- /dev/null
+++ b/examples/quick_start/together_example_complete.py
@@ -0,0 +1,74 @@
+"""
+Usage:
+export TOGETHER_API_KEY=sk-******
+python3 together_example_complete.py
+"""
+
+import sglang as sgl
+import os
+
+
+@sgl.function
+def few_shot_qa(s, question):
+    s += (
+"""The following are questions with answers.
+Q: What is the capital of France?
+A: Paris
+Q: What is the capital of Germany?
+A: Berlin
+Q: What is the capital of Italy?
+A: Rome
+""")
+    s += "Q: " + question + "\n"
+    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
+
+
+def single():
+    state = few_shot_qa.run(question="What is the capital of the United States?")
+    answer = state["answer"].strip().lower()
+
+    assert "washington" in answer, f"answer: {state['answer']}"
+
+    print(state.text())
+
+
+def stream():
+    state = few_shot_qa.run(
+        question="What is the capital of the United States?",
+        stream=True)
+
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = few_shot_qa.run_batch([
+        {"question": "What is the capital of the United States?"},
+        {"question": "What is the capital of China?"},
+    ])
+
+    for s in states:
+        print(s["answer"])
+
+
+if __name__ == "__main__":
+    backend = sgl.OpenAI(
+        model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
+        is_chat_model=False,
+        base_url="https://api.together.xyz/v1",
+        api_key=os.environ.get("TOGETHER_API_KEY"),
+    )
+    sgl.set_default_backend(backend)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
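In together_example_complete.py, is_chat_model=False tells the backend to treat the model as a plain completion model, so sgl.gen goes through the completions endpoint instead of chat. A raw-SDK sketch (not part of the patch) of the few-shot call, with the prompt abbreviated to one exemplar and max_tokens chosen arbitrarily:

    # Sketch only: the completions-endpoint request that is_chat_model=False implies.
    import os

    import openai

    client = openai.OpenAI(
        base_url="https://api.together.xyz/v1",
        api_key=os.environ.get("TOGETHER_API_KEY"),
    )
    few_shot = (
        "The following are questions with answers.\n"
        "Q: What is the capital of France?\n"
        "A: Paris\n"
    )
    resp = client.completions.create(
        model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        prompt=few_shot + "Q: What is the capital of the United States?\nA:",
        stop="\n",      # mirrors sgl.gen("answer", stop="\n", temperature=0)
        temperature=0,
        max_tokens=16,  # arbitrary small cap for a one-line answer
    )
    print(resp.choices[0].text.strip())
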
" "For OpenAI chat models, sgl.gen must be right after sgl.assistant" @@ -122,7 +138,11 @@ class OpenAI(BaseBackend): ): if sampling_params.dtype is None: if self.is_chat_model: - assert s.text_.endswith("ASSISTANT:") + if not s.text_.endswith(self.chat_begin_str): + raise RuntimeError( + "This use case is not supported. " + "For OpenAI chat models, sgl.gen must be right after sgl.assistant" + ) prompt = s.messages_ else: prompt = s.text_ @@ -241,7 +261,10 @@ def openai_completion_stream(client, retries=3, is_chat=None, prompt=None, **kwa messages=prompt, stream=True, **kwargs ) for ret in generator: - content = ret.choices[0].delta.content + try: + content = ret.choices[0].delta.content + except IndexError: + content = None yield content or "", {} else: generator = client.completions.create(