Add Together and AzureOpenAI examples (#184)
@@ -23,7 +23,7 @@ def single():
     for m in state.messages():
         print(m["role"], ":", m["content"])

-    print("answer_1", state["answer_1"])
+    print("\n-- answer_1 --\n", state["answer_1"])


 def stream():
examples/quick_start/azure_openai_example_chat.py (new file, 76 lines)
@@ -0,0 +1,76 @@
+"""
+Usage:
+export AZURE_OPENAI_API_KEY=sk-******
+python3 azure_openai_example_chat.py
+"""
+import sglang as sgl
+import os
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.system("You are a helpful assistant.")
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
+
+
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
+
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("\n-- answer_1 --\n", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch([
+        {"question_1": "What is the capital of the United States?",
+         "question_2": "List two local attractions."},
+
+        {"question_1": "What is the capital of France?",
+         "question_2": "What is the population of this city?"},
+    ])
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    backend = sgl.OpenAI(
+        model_name="azure-gpt-4",
+        api_version="2023-07-01-preview",
+        azure_endpoint="https://oai-arena-sweden.openai.azure.com/",
+        api_key=os.environ["AZURE_OPENAI_API_KEY"],
+        is_azure=True,
+    )
+    sgl.set_default_backend(backend)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
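Note that api_version and azure_endpoint are not sgl.OpenAI parameters of their own: they pass through *args/**kwargs to the underlying client, which the backend change further below selects via is_azure=True. A minimal sketch of the equivalent direct construction, assuming the openai v1 SDK:

import os
import openai

# Roughly what sgl.OpenAI(..., is_azure=True) builds under the hood: the
# extra keyword arguments are forwarded verbatim to openai.AzureOpenAI.
client = openai.AzureOpenAI(
    api_version="2023-07-01-preview",
    azure_endpoint="https://oai-arena-sweden.openai.azure.com/",
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
)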
@@ -23,7 +23,7 @@ def single():
     for m in state.messages():
         print(m["role"], ":", m["content"])

-    print("answer_1", state["answer_1"])
+    print("\n-- answer_1 --\n", state["answer_1"])


 def stream():
@@ -24,7 +24,7 @@ def single():
     for m in state.messages():
         print(m["role"], ":", m["content"])

-    print("answer_1", state["answer_1"])
+    print("\n-- answer_1 --\n", state["answer_1"])


 def stream():
@@ -22,7 +22,7 @@ def single():
     for m in state.messages():
         print(m["role"], ":", m["content"])

-    print("answer_1", state["answer_1"])
+    print("\n-- answer_1 --\n", state["answer_1"])


 def stream():
examples/quick_start/together_example_chat.py (new file, 74 lines)
@@ -0,0 +1,74 @@
+"""
+Usage:
+export TOGETHER_API_KEY=sk-******
+python3 together_example_chat.py
+"""
+import sglang as sgl
+import os
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.system("You are a helpful assistant.")
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
+
+
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
+
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("\n-- answer_1 --\n", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch([
+        {"question_1": "What is the capital of the United States?",
+         "question_2": "List two local attractions."},
+
+        {"question_1": "What is the capital of France?",
+         "question_2": "What is the population of this city?"},
+    ])
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    backend = sgl.OpenAI(
+        model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
+        base_url="https://api.together.xyz/v1",
+        api_key=os.environ.get("TOGETHER_API_KEY"),
+    )
+    sgl.set_default_backend(backend)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
examples/quick_start/together_example_complete.py (new file, 74 lines)
@@ -0,0 +1,74 @@
+"""
+Usage:
+export TOGETHER_API_KEY=sk-******
+python3 together_example_complete.py
+"""
+
+import sglang as sgl
+import os
+
+
+@sgl.function
+def few_shot_qa(s, question):
+    s += (
+"""The following are questions with answers.
+Q: What is the capital of France?
+A: Paris
+Q: What is the capital of Germany?
+A: Berlin
+Q: What is the capital of Italy?
+A: Rome
+""")
+    s += "Q: " + question + "\n"
+    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
+
+
+def single():
+    state = few_shot_qa.run(question="What is the capital of the United States?")
+    answer = state["answer"].strip().lower()
+
+    assert "washington" in answer, f"answer: {state['answer']}"
+
+    print(state.text())
+
+
+def stream():
+    state = few_shot_qa.run(
+        question="What is the capital of the United States?",
+        stream=True)
+
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = few_shot_qa.run_batch([
+        {"question": "What is the capital of the United States?"},
+        {"question": "What is the capital of China?"},
+    ])
+
+    for s in states:
+        print(s["answer"])
+
+
+if __name__ == "__main__":
+    backend = sgl.OpenAI(
+        model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
+        is_chat_model=False,
+        base_url="https://api.together.xyz/v1",
+        api_key=os.environ.get("TOGETHER_API_KEY"),
+    )
+    sgl.set_default_backend(backend)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
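Unlike the chat examples, this script passes is_chat_model=False, which the updated backend below uses to route requests through the plain completions endpoint with one flat prompt string instead of a list of chat messages. A hedged sketch of the equivalent direct call, assuming the openai v1 SDK pointed at Together's OpenAI-compatible base URL:

import os
import openai

client = openai.OpenAI(
    base_url="https://api.together.xyz/v1",
    api_key=os.environ.get("TOGETHER_API_KEY"),
)
# Completions endpoint: a flat prompt string, not a list of chat messages.
resp = client.completions.create(
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",
    prompt="Q: What is the capital of France?\nA:",
    max_tokens=16,
    stop="\n",
    temperature=0,
)
print(resp.choices[0].text)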
@@ -4,7 +4,7 @@ from typing import Callable, List, Optional, Union

 import numpy as np
 from sglang.backend.base_backend import BaseBackend
-from sglang.lang.chat_template import get_chat_template
+from sglang.lang.chat_template import get_chat_template_by_model_path, ChatTemplate
 from sglang.lang.interpreter import StreamExecutor
 from sglang.lang.ir import SglSamplingParams
@@ -41,23 +41,39 @@ INSTRUCT_MODEL_NAMES = [


 class OpenAI(BaseBackend):
-    def __init__(self, model_name, *args, **kwargs):
+    def __init__(self, model_name: str,
+                 is_chat_model: Optional[bool] = None,
+                 chat_template: Optional[ChatTemplate] = None,
+                 is_azure: bool = False,
+                 *args, **kwargs):
         super().__init__()

         if isinstance(openai, Exception):
             raise openai

-        self.client = openai.OpenAI(*args, **kwargs)
+        if is_azure:
+            self.client = openai.AzureOpenAI(*args, **kwargs)
+        else:
+            self.client = openai.OpenAI(*args, **kwargs)
+
         self.model_name = model_name
-        self.tokenizer = tiktoken.encoding_for_model(model_name)
+        try:
+            self.tokenizer = tiktoken.encoding_for_model(model_name)
+        except KeyError:
+            self.tokenizer = tiktoken.get_encoding("cl100k_base")
         self.logit_bias_int = create_logit_bias_int(self.tokenizer)

-        if model_name in INSTRUCT_MODEL_NAMES:
-            self.is_chat_model = False
-        else:
-            self.is_chat_model = True
-
-        self.chat_template = get_chat_template("default")
+        self.chat_template = chat_template or get_chat_template_by_model_path(model_name)
+
+        if is_chat_model is not None:
+            self.is_chat_model = is_chat_model
+        else:
+            if model_name in INSTRUCT_MODEL_NAMES:
+                self.is_chat_model = False
+            else:
+                self.is_chat_model = True
+
+        self.chat_begin_str = self.chat_template.role_prefix_and_suffix["assistant"][0]

     def get_chat_template(self):
         return self.chat_template
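The new try/except around tiktoken matters for these backends: tiktoken.encoding_for_model raises KeyError for model names it does not recognize, which includes Together model paths like the Mixtral one above, so the constructor falls back to a generic encoding. A minimal standalone sketch of the fallback:

import tiktoken

model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
try:
    tokenizer = tiktoken.encoding_for_model(model_name)
except KeyError:
    # Non-OpenAI model names are unknown to tiktoken; use a generic encoding
    # so logit-bias token ids can still be computed.
    tokenizer = tiktoken.get_encoding("cl100k_base")
print(tokenizer.name)  # "cl100k_base" for the Mixtral name above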
@@ -69,7 +85,7 @@ class OpenAI(BaseBackend):
     ):
         if sampling_params.dtype is None:
             if self.is_chat_model:
-                if not s.text_.endswith("ASSISTANT:"):
+                if not s.text_.endswith(self.chat_begin_str):
                     raise RuntimeError(
                         "This use case is not supported. "
                         "For OpenAI chat models, sgl.gen must be right after sgl.assistant"
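Replacing the hard-coded "ASSISTANT:" suffix check with self.chat_begin_str makes the guard template-aware: whichever chat template the backend resolved, generation must begin right after that template's assistant prefix. A short sketch of the lookup, using the get_chat_template helper already referenced in this file (the exact prefix string depends on the template):

from sglang.lang.chat_template import get_chat_template

template = get_chat_template("default")
# role_prefix_and_suffix maps a role to its (prefix, suffix) pair;
# chat_begin_str is the assistant prefix, i.e. element [0].
chat_begin_str = template.role_prefix_and_suffix["assistant"][0]
print(repr(chat_begin_str))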
@@ -122,7 +138,11 @@ class OpenAI(BaseBackend):
     ):
         if sampling_params.dtype is None:
             if self.is_chat_model:
-                assert s.text_.endswith("ASSISTANT:")
+                if not s.text_.endswith(self.chat_begin_str):
+                    raise RuntimeError(
+                        "This use case is not supported. "
+                        "For OpenAI chat models, sgl.gen must be right after sgl.assistant"
+                    )
                 prompt = s.messages_
             else:
                 prompt = s.text_
@@ -241,7 +261,10 @@ def openai_completion_stream(client, retries=3, is_chat=None, prompt=None, **kwa
             messages=prompt, stream=True, **kwargs
         )
         for ret in generator:
-            content = ret.choices[0].delta.content
+            try:
+                content = ret.choices[0].delta.content
+            except IndexError:
+                content = None
             yield content or "", {}
     else:
         generator = client.completions.create(
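The try/except around ret.choices[0] guards against stream chunks whose choices list is empty; Azure deployments in particular can emit such chunks (for example, content-filter bookkeeping chunks at the start of a stream). A minimal sketch of the same defensive pattern, exercised with a stand-in chunk object:

from types import SimpleNamespace

def first_delta_content(chunk):
    # Some providers (Azure in particular) send stream chunks with an empty
    # choices list; treat those as "no content" instead of crashing the stream.
    try:
        return chunk.choices[0].delta.content or ""
    except IndexError:
        return ""

empty_chunk = SimpleNamespace(choices=[])
print(repr(first_delta_content(empty_chunk)))  # ''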