[Feat] Add llava qwen, llava mistral (#419)
Co-authored-by: Bo Li <drluodian@gmail.com>
commit 664287b2a7 (parent e0ae5d42ec)
examples/usage/llava/http_llama3_llava_test.py (new file, 117 lines)
@@ -0,0 +1,117 @@
"""
Usage:

# Install the latest llava-next: pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
# Install the latest sglang.

# Endpoint Service CLI:
# python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --tokenizer-path lmms-lab/llama3-llava-next-8b-tokenizer --port=30000 --host="127.0.0.1" --tp-size=4

python3 http_llama3_llava_test.py

Output:
"Friends posing for a fun photo with a life-sized teddy bear, creating a playful and memorable moment."
"""

import argparse
import asyncio
import copy
import json
import time

import aiohttp
import requests

from llava.conversation import (
    SeparatorStyle,
    conv_llava_llama_3,
    conv_qwen,
    conv_templates,
    default_conversation,
)


async def send_request(url, data, delay=0):
    # Wait an optional delay, then POST the payload and return the JSON response.
    await asyncio.sleep(delay)
    async with aiohttp.ClientSession() as session:
        async with session.post(url, json=data) as resp:
            output = await resp.json()
    return output


async def test_concurrent(args):
    url = f"{args.host}:{args.port}"

    prompt = "<image>\nPlease generate caption towards this image."
    conv_template = copy.deepcopy(conv_llava_llama_3)
    conv_template.append_message(role="user", message=prompt)
    prompt_with_template = conv_template.get_prompt()
    response = []
    for i in range(1):
        response.append(
            send_request(
                url + "/generate",
                {
                    "text": prompt_with_template,
                    "image_data": "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg",
                    "sampling_params": {
                        "max_new_tokens": 1024,
                        "temperature": 0,
                        "top_p": 1.0,
                        "presence_penalty": 2,
                        "frequency_penalty": 2,
                        "stop": "<|eot_id|>",
                    },
                },
            )
        )

    rets = await asyncio.gather(*response)
    for ret in rets:
        print(ret["text"])


def test_streaming(args):
    url = f"{args.host}:{args.port}"
    prompt = "<image>\nPlease generate caption towards this image."
    conv_template = copy.deepcopy(conv_llava_llama_3)
    conv_template.append_message(role="user", message=prompt)
    prompt_with_template = conv_template.get_prompt()
    pload = {
        "text": prompt_with_template,
        "sampling_params": {
            "max_new_tokens": 1024,
            "temperature": 0,
            "top_p": 1.0,
            "presence_penalty": 2,
            "frequency_penalty": 2,
            "stop": "<|eot_id|>",
        },
        "image_data": "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg",
        "stream": True,
    }
    response = requests.post(
        url + "/generate",
        json=pload,
        stream=True,
    )

    prev = 0
    # Server-sent events: each line looks like "data: {...}"; "data: [DONE]" ends the stream.
    for chunk in response.iter_lines(decode_unicode=False):
        chunk = chunk.decode("utf-8")
        if chunk and chunk.startswith("data:"):
            if chunk == "data: [DONE]":
                break
            data = json.loads(chunk[5:].strip("\n"))
            output = data["text"].strip()
            # Print only the newly generated suffix of the accumulated text.
            print(output[prev:], end="", flush=True)
            prev = len(output)
    print("")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="http://127.0.0.1")
    parser.add_argument("--port", type=int, default=30000)
    args = parser.parse_args()
    asyncio.run(test_concurrent(args))
    test_streaming(args)
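For quick endpoint sanity checks it can help to bypass the llava package entirely. Below is a minimal synchronous sketch of the same /generate call; the hand-written Llama-3-instruct prompt is an assumption about what conv_llava_llama_3.get_prompt() expands to, so verify it against your installed llava-next version before relying on it.

import requests

# Assumed expansion of the conv_llava_llama_3 template for a single user turn
# (illustrative, not taken from this commit).
prompt = (
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
    "<image>\nPlease generate caption towards this image.<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)
resp = requests.post(
    "http://127.0.0.1:30000/generate",
    json={
        "text": prompt,
        "image_data": "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg",
        "sampling_params": {"max_new_tokens": 128, "temperature": 0, "stop": "<|eot_id|>"},
    },
)
print(resp.json()["text"])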
examples/usage/llava/http_qwen_llava_test.py (new file, 117 lines)
@@ -0,0 +1,117 @@
"""
Usage:

# Install the latest llava-next: pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
# Install the latest sglang.

# Endpoint Service CLI:
# python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --tokenizer-path lmms-lab/llavanext-qwen-tokenizer --port=30000 --host="127.0.0.1" --tp-size=4

python3 http_qwen_llava_test.py

Output:
"Two children pose with a large teddy bear, one holding a smaller stuffed bear, in a room with an American flag and potted plants."
"""

import argparse
import asyncio
import copy
import json
import time

import aiohttp
import requests

from llava.conversation import (
    SeparatorStyle,
    conv_llava_llama_3,
    conv_qwen,
    conv_templates,
    default_conversation,
)


async def send_request(url, data, delay=0):
    # Wait an optional delay, then POST the payload and return the JSON response.
    await asyncio.sleep(delay)
    async with aiohttp.ClientSession() as session:
        async with session.post(url, json=data) as resp:
            output = await resp.json()
    return output


async def test_concurrent(args):
    url = f"{args.host}:{args.port}"

    prompt = "<image>\nPlease generate caption towards this image."
    conv_template = copy.deepcopy(conv_qwen)
    conv_template.append_message(role="user", message=prompt)
    prompt_with_template = conv_template.get_prompt()
    response = []
    for i in range(1):
        response.append(
            send_request(
                url + "/generate",
                {
                    "text": prompt_with_template,
                    "image_data": "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg",
                    "sampling_params": {
                        "max_new_tokens": 1024,
                        "temperature": 0,
                        "top_p": 1.0,
                        "presence_penalty": 2,
                        "frequency_penalty": 2,
                        "stop": "<|im_end|>",
                    },
                },
            )
        )

    rets = await asyncio.gather(*response)
    for ret in rets:
        print(ret["text"])


def test_streaming(args):
    url = f"{args.host}:{args.port}"
    prompt = "<image>\nPlease generate caption towards this image."
    conv_template = copy.deepcopy(conv_qwen)
    conv_template.append_message(role="user", message=prompt)
    prompt_with_template = conv_template.get_prompt()
    pload = {
        "text": prompt_with_template,
        "sampling_params": {
            "max_new_tokens": 1024,
            "temperature": 0,
            "top_p": 1.0,
            "presence_penalty": 2,
            "frequency_penalty": 2,
            "stop": "<|im_end|>",
        },
        "image_data": "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg",
        "stream": True,
    }
    response = requests.post(
        url + "/generate",
        json=pload,
        stream=True,
    )

    prev = 0
    # Server-sent events: each line looks like "data: {...}"; "data: [DONE]" ends the stream.
    for chunk in response.iter_lines(decode_unicode=False):
        chunk = chunk.decode("utf-8")
        if chunk and chunk.startswith("data:"):
            if chunk == "data: [DONE]":
                break
            data = json.loads(chunk[5:].strip("\n"))
            output = data["text"].strip()
            # Print only the newly generated suffix of the accumulated text.
            print(output[prev:], end="", flush=True)
            prev = len(output)
    print("")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="http://127.0.0.1")
    parser.add_argument("--port", type=int, default=30000)
    args = parser.parse_args()
    # asyncio.run(test_concurrent(args))
    test_streaming(args)
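Since test_concurrent is left commented out in __main__ above, here is a self-contained sketch of fanning several such requests out concurrently with aiohttp. NUM_REQUESTS, the stagger delay, and the hand-built ChatML prompt are illustrative assumptions; conv_qwen should produce an equivalent <|im_start|>/<|im_end|> layout, but check locally.

import asyncio

import aiohttp

NUM_REQUESTS = 4  # illustrative; tune to taste
# Assumed ChatML-style expansion of conv_qwen for a single user turn.
PROMPT = (
    "<|im_start|>user\n<image>\nPlease generate caption towards this image.<|im_end|>\n"
    "<|im_start|>assistant\n"
)
PAYLOAD = {
    "text": PROMPT,
    "image_data": "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg",
    "sampling_params": {"max_new_tokens": 128, "temperature": 0, "stop": "<|im_end|>"},
}


async def main():
    async with aiohttp.ClientSession() as session:

        async def one(i):
            await asyncio.sleep(0.1 * i)  # stagger requests slightly
            async with session.post("http://127.0.0.1:30000/generate", json=PAYLOAD) as resp:
                return await resp.json()

        rets = await asyncio.gather(*(one(i) for i in range(NUM_REQUESTS)))
        for ret in rets:
            print(ret["text"])


asyncio.run(main())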
examples/usage/llava/srt_llava_next_test.py (new file, 88 lines)
@@ -0,0 +1,88 @@
"""
Usage: python3 srt_llava_next_test.py
"""

import sglang as sgl
from sglang.lang.chat_template import get_chat_template
from sglang.srt.utils import load_image

from PIL import ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True  # Allow loading of truncated images


@sgl.function
def image_qa(s, image, question):
    s += sgl.user(sgl.image(image) + question)
    s += sgl.assistant(sgl.gen("answer"))


def single():
    image_url = "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg"
    pil_image = load_image(image_url)
    state = image_qa.run(image=pil_image, question="What is this?", max_new_tokens=512)
    print(state["answer"], "\n")


def stream():
    image_url = "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg"
    pil_image = load_image(image_url)
    state = image_qa.run(
        image=pil_image,
        question="Please generate a short caption for this image.",
        max_new_tokens=512,
        temperature=0,
        stream=True,
    )

    for out in state.text_iter("answer"):
        print(out, end="", flush=True)
    print()


def batch():
    image_url = "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg"
    pil_image = load_image(image_url)
    states = image_qa.run_batch(
        [
            {"image": pil_image, "question": "What is this?"},
            {"image": pil_image, "question": "What is this?"},
        ],
        max_new_tokens=512,
    )
    for s in states:
        print(s["answer"], "\n")


if __name__ == "__main__":
    import multiprocessing as mp

    mp.set_start_method("spawn", force=True)
    runtime = sgl.Runtime(
        model_path="lmms-lab/llama3-llava-next-8b",
        tokenizer_path="lmms-lab/llama3-llava-next-8b-tokenizer",
    )
    runtime.endpoint.chat_template = get_chat_template("llama-3-instruct")
    # runtime = sgl.Runtime(
    #     model_path="lmms-lab/llava-next-72b",
    #     tokenizer_path="lmms-lab/llavanext-qwen-tokenizer",
    # )
    # runtime.endpoint.chat_template = get_chat_template("chatml-llava")
    sgl.set_default_backend(runtime)
    print(f"chat template: {runtime.endpoint.chat_template.name}")

    # Or you can use API models
    # sgl.set_default_backend(sgl.OpenAI("gpt-4-vision-preview"))
    # sgl.set_default_backend(sgl.VertexAI("gemini-pro-vision"))

    # Run a single request
    print("\n========== single ==========\n")
    single()

    # Stream output
    print("\n========== stream ==========\n")
    stream()

    # Run a batch of requests
    print("\n========== batch ==========\n")
    batch()

    runtime.shutdown()
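To avoid hand-editing the commented-out Qwen block in __main__, runtime selection can also be driven by a flag. A minimal sketch, assuming the same model/template pairs shown above; the --model flag and the MODELS table are illustrative and not part of the commit.

import argparse
import multiprocessing as mp

import sglang as sgl
from sglang.lang.chat_template import get_chat_template

# Model/template pairs mirroring the two configurations in __main__ above.
MODELS = {
    "llama3": dict(
        model_path="lmms-lab/llama3-llava-next-8b",
        tokenizer_path="lmms-lab/llama3-llava-next-8b-tokenizer",
        template="llama-3-instruct",
    ),
    "qwen": dict(
        model_path="lmms-lab/llava-next-72b",
        tokenizer_path="lmms-lab/llavanext-qwen-tokenizer",
        template="chatml-llava",
    ),
}

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", choices=sorted(MODELS), default="llama3")
    args = parser.parse_args()
    cfg = MODELS[args.model]

    mp.set_start_method("spawn", force=True)
    runtime = sgl.Runtime(
        model_path=cfg["model_path"], tokenizer_path=cfg["tokenizer_path"]
    )
    runtime.endpoint.chat_template = get_chat_template(cfg["template"])
    sgl.set_default_backend(runtime)
    print(f"chat template: {runtime.endpoint.chat_template.name}")
    runtime.shutdown()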