[Feat] Add llava qwen, llava mistral (#419)

Co-authored-by: Bo Li <drluodian@gmail.com>
2024-05-14 13:17:50 +08:00
parent e0ae5d42ec
commit 664287b2a7
7 changed files with 1021 additions and 1 deletions
--- a/examples/usage/llava/http_llama3_llava_test.py
+++ b/examples/usage/llava/http_llama3_llava_test.py
@@ -0,0 +1,124 @@
+"""
+Usage:
+# Installing latest llava-next: pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
+# Installing latest sglang.
+
+# Endpoint Service CLI:
+# python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --tokenizer-path lmms-lab/llama3-llava-next-8b-tokenizer --port=30000 --host="127.0.0.1" --tp-size=4
+
+python3 http_llama3_llava_test.py
+
+Output:
+"Friends posing for a fun photo with a life-sized teddy bear, creating a playful and memorable moment."
+"""
+
+import argparse
+import asyncio
+import json
+import time
+import copy
+
+import aiohttp
+import requests
+
+from llava.conversation import (
+    default_conversation,
+    conv_templates,
+    SeparatorStyle,
+    conv_llava_llama_3,
+    conv_qwen,
+)
+
+# Image and question shared by both test paths.
+IMAGE_URL = "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg"
+PROMPT = "<image>\nPlease generate caption towards this image."
+
+
+def _build_prompt():
+    """Return PROMPT rendered with the llama-3 LLaVA conversation template."""
+    # Work on a copy: append_message is called on it, and the template object
+    # itself is imported from llava and shared by every caller.
+    conv = copy.deepcopy(conv_llava_llama_3)
+    # Use the template's own role labels instead of a hard-coded "user", and
+    # add an empty reply turn for the model to complete.
+    # NOTE(review): this is the usual LLaVA Conversation usage - confirm it
+    # against the installed llava.conversation module.
+    conv.append_message(role=conv.roles[0], message=PROMPT)
+    conv.append_message(role=conv.roles[1], message=None)
+    return conv.get_prompt()
+
+
+def _sampling_params():
+    """Return a fresh dict of the sampling parameters sent with every request."""
+    return {
+        "max_new_tokens": 1024,
+        "temperature": 0,
+        "top_p": 1.0,
+        "presence_penalty": 2,
+        "frequency_penalty": 2,
+        "stop": "<|eot_id|>",
+    }
+
+
+async def send_request(url, data, delay=0):
+    """Wait delay seconds, POST data as JSON to url, and return the decoded JSON reply."""
+    await asyncio.sleep(delay)
+    async with aiohttp.ClientSession() as session:
+        async with session.post(url, json=data) as resp:
+            # Surface HTTP errors here instead of as a KeyError on "text" later.
+            resp.raise_for_status()
+            return await resp.json()
+
+
+async def test_concurrent(args):
+    """Send the caption request via aiohttp and print the "text" field of each reply."""
+    url = f"{args.host}:{args.port}"
+    payload = {
+        "text": _build_prompt(),
+        "image_data": IMAGE_URL,
+        "sampling_params": _sampling_params(),
+    }
+    # Only one request is issued; raise the range to fan out more in parallel.
+    pending = [send_request(url + "/generate", payload) for _ in range(1)]
+
+    rets = await asyncio.gather(*pending)
+    for ret in rets:
+        print(ret["text"])
+
+
+def test_streaming(args):
+    """Send the caption request with streaming enabled and print text as it arrives."""
+    url = f"{args.host}:{args.port}"
+    pload = {
+        "text": _build_prompt(),
+        "sampling_params": _sampling_params(),
+        "image_data": IMAGE_URL,
+        "stream": True,
+    }
+
+    # The context manager releases the connection even when the loop stops early.
+    with requests.post(url + "/generate", json=pload, stream=True) as response:
+        response.raise_for_status()
+
+        prev = 0  # number of characters already printed
+        for chunk in response.iter_lines(decode_unicode=False):
+            chunk = chunk.decode("utf-8")
+            if not chunk.startswith("data:"):
+                continue
+            if chunk == "data: [DONE]":
+                break
+            data = json.loads(chunk[5:].strip("\n"))
+            # Print only the part not shown yet; this assumes "text" holds the
+            # whole output generated so far - TODO confirm against the server.
+            output = data["text"].strip()
+            print(output[prev:], end="", flush=True)
+            prev = len(output)
+    print("")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="http://127.0.0.1")
+    parser.add_argument("--port", type=int, default=30000)
+    args = parser.parse_args()
+    asyncio.run(test_concurrent(args))
+    test_streaming(args)
--- a/examples/usage/llava/http_qwen_llava_test.py
+++ b/examples/usage/llava/http_qwen_llava_test.py
@@ -0,0 +1,124 @@
+"""
+Usage:
+# Installing latest llava-next: pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
+# Installing latest sglang.
+
+# Endpoint Service CLI:
+# python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --tokenizer-path lmms-lab/llavanext-qwen-tokenizer --port=30000 --host="127.0.0.1" --tp-size=4
+
+python3 http_qwen_llava_test.py
+
+Output:
+"Two children pose with a large teddy bear, one holding a smaller stuffed bear, in a room with an American flag and potted plants."
+"""
+
+import argparse
+import asyncio
+import json
+import time
+import copy
+
+import aiohttp
+import requests
+
+from llava.conversation import (
+    default_conversation,
+    conv_templates,
+    SeparatorStyle,
+    conv_llava_llama_3,
+    conv_qwen,
+)
+
+# Image and question shared by both test paths.
+IMAGE_URL = "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg"
+PROMPT = "<image>\nPlease generate caption towards this image."
+
+
+def _build_prompt():
+    """Return PROMPT rendered with the Qwen LLaVA conversation template."""
+    # Work on a copy: append_message is called on it, and the template object
+    # itself is imported from llava and shared by every caller.
+    conv = copy.deepcopy(conv_qwen)
+    # Use the template's own role labels instead of a hard-coded "user", and
+    # add an empty reply turn for the model to complete.
+    # NOTE(review): the Qwen template's role labels are believed to differ
+    # from the bare string "user" - confirm against llava.conversation.
+    conv.append_message(role=conv.roles[0], message=PROMPT)
+    conv.append_message(role=conv.roles[1], message=None)
+    return conv.get_prompt()
+
+
+def _sampling_params():
+    """Return a fresh dict of the sampling parameters sent with every request."""
+    return {
+        "max_new_tokens": 1024,
+        "temperature": 0,
+        "top_p": 1.0,
+        "presence_penalty": 2,
+        "frequency_penalty": 2,
+        "stop": "<|im_end|>",
+    }
+
+
+async def send_request(url, data, delay=0):
+    """Wait delay seconds, POST data as JSON to url, and return the decoded JSON reply."""
+    await asyncio.sleep(delay)
+    async with aiohttp.ClientSession() as session:
+        async with session.post(url, json=data) as resp:
+            # Surface HTTP errors here instead of as a KeyError on "text" later.
+            resp.raise_for_status()
+            return await resp.json()
+
+
+async def test_concurrent(args):
+    """Send the caption request via aiohttp and print the "text" field of each reply."""
+    url = f"{args.host}:{args.port}"
+    payload = {
+        "text": _build_prompt(),
+        "image_data": IMAGE_URL,
+        "sampling_params": _sampling_params(),
+    }
+    # Only one request is issued; raise the range to fan out more in parallel.
+    pending = [send_request(url + "/generate", payload) for _ in range(1)]
+
+    rets = await asyncio.gather(*pending)
+    for ret in rets:
+        print(ret["text"])
+
+
+def test_streaming(args):
+    """Send the caption request with streaming enabled and print text as it arrives."""
+    url = f"{args.host}:{args.port}"
+    pload = {
+        "text": _build_prompt(),
+        "sampling_params": _sampling_params(),
+        "image_data": IMAGE_URL,
+        "stream": True,
+    }
+
+    # The context manager releases the connection even when the loop stops early.
+    with requests.post(url + "/generate", json=pload, stream=True) as response:
+        response.raise_for_status()
+
+        prev = 0  # number of characters already printed
+        for chunk in response.iter_lines(decode_unicode=False):
+            chunk = chunk.decode("utf-8")
+            if not chunk.startswith("data:"):
+                continue
+            if chunk == "data: [DONE]":
+                break
+            data = json.loads(chunk[5:].strip("\n"))
+            # Print only the part not shown yet; this assumes "text" holds the
+            # whole output generated so far - TODO confirm against the server.
+            output = data["text"].strip()
+            print(output[prev:], end="", flush=True)
+            prev = len(output)
+    print("")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="http://127.0.0.1")
+    parser.add_argument("--port", type=int, default=30000)
+    args = parser.parse_args()
+    # asyncio.run(test_concurrent(args))
+    test_streaming(args)
--- a/examples/usage/llava/srt_llava_next_test.py
+++ b/examples/usage/llava/srt_llava_next_test.py
@@ -0,0 +1,96 @@
+"""
+Usage: python3 srt_llava_next_test.py
+"""
+
+import sglang as sgl
+from sglang.srt.utils import load_image
+from sglang.lang.chat_template import get_chat_template
+
+from PIL import ImageFile
+
+ImageFile.LOAD_TRUNCATED_IMAGES = True  # Allow loading of truncated images
+
+# Sample image used by every example below.
+IMAGE_URL = "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg"
+
+
+@sgl.function
+def image_qa(s, image, question):
+    """Put image plus question in a user turn, then generate the reply as "answer"."""
+    s += sgl.user(sgl.image(image) + question)
+    s += sgl.assistant(sgl.gen("answer"))
+
+
+def single():
+    """Run one image question and print the answer."""
+    pil_image = load_image(IMAGE_URL)
+    state = image_qa.run(image=pil_image, question="What is this?", max_new_tokens=512)
+    print(state["answer"], "\n")
+
+
+def stream():
+    """Run one caption request with stream=True and print each piece as it is yielded."""
+    pil_image = load_image(IMAGE_URL)
+    state = image_qa.run(
+        image=pil_image,
+        question="Please generate short caption for this image.",
+        max_new_tokens=512,
+        temperature=0,
+        stream=True,
+    )
+
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    """Run the same image question twice as one batch and print each answer."""
+    pil_image = load_image(IMAGE_URL)
+    states = image_qa.run_batch(
+        [
+            {"image": pil_image, "question": "What is this?"},
+            {"image": pil_image, "question": "What is this?"},
+        ],
+        max_new_tokens=512,
+    )
+    for s in states:
+        print(s["answer"], "\n")
+
+
+if __name__ == "__main__":
+    import multiprocessing as mp
+
+    mp.set_start_method("spawn", force=True)
+    runtime = sgl.Runtime(
+        model_path="lmms-lab/llama3-llava-next-8b",
+        tokenizer_path="lmms-lab/llama3-llava-next-8b-tokenizer",
+    )
+    # Shut the runtime down even if one of the examples raises.
+    try:
+        runtime.endpoint.chat_template = get_chat_template("llama-3-instruct")
+        # runtime = sgl.Runtime(
+        #     model_path="lmms-lab/llava-next-72b",
+        #     tokenizer_path="lmms-lab/llavanext-qwen-tokenizer",
+        # )
+        # runtime.endpoint.chat_template = get_chat_template("chatml-llava")
+        sgl.set_default_backend(runtime)
+        print(f"chat template: {runtime.endpoint.chat_template.name}")
+
+        # Or you can use API models
+        # sgl.set_default_backend(sgl.OpenAI("gpt-4-vision-preview"))
+        # sgl.set_default_backend(sgl.VertexAI("gemini-pro-vision"))
+
+        # Run a single request
+        print("\n========== single ==========\n")
+        single()
+
+        # Stream output
+        print("\n========== stream ==========\n")
+        stream()
+
+        # Run a batch of requests
+        print("\n========== batch ==========\n")
+        batch()
+    finally:
+        runtime.shutdown()