model(vlm): pixtral (#5084)
This commit is contained in:
@@ -33,9 +33,10 @@ The `hidden_states` folder contains examples on how to extract hidden states usi
|
||||
* `hidden_states_engine.py`: An example of how to extract hidden states using the Engine API.
|
||||
* `hidden_states_server.py`: An example of how to extract hidden states using the Server API.
|
||||
|
||||
## LLaVA-NeXT
|
||||
## Multimodal
|
||||
|
||||
SGLang supports multimodal inputs for various model architectures. The `multimodal` folder contains examples showing how to use urls, files or encoded data to make requests to multimodal models. Examples include querying the [Llava-OneVision](multimodal/llava_onevision_server.py) model (image, multi-image, video), Llava-backed [Qwen-Llava](multimodal/qwen_llava_server.py) and [Llama3-Llava](multimodal/llama3_llava_server.py) models (image, multi-image), and Mistral AI's [Pixtral](multimodal/pixtral_server.py) (image, multi-image).
|
||||
|
||||
SGLang supports LLaVA-OneVision with single-image, multi-image, and video inputs. The folder `llava_onevision` shows how to do this.
|
||||
|
||||
## Token In, Token Out
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ Usage:
|
||||
# Endpoint Service CLI:
|
||||
python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000
|
||||
|
||||
python3 http_llama3_llava_test.py
|
||||
python3 llama3_llava_server.py
|
||||
|
||||
Output:
|
||||
"Friends posing for a fun photo with a life-sized teddy bear, creating a playful and memorable moment."
|
||||
@@ -3,7 +3,7 @@ Usage:
|
||||
|
||||
python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8
|
||||
|
||||
python3 http_llava_onevision_test.py
|
||||
python3 llava_onevision_server.py
|
||||
"""
|
||||
|
||||
import base64
|
||||
127
examples/runtime/multimodal/pixtral_server.py
Normal file
127
examples/runtime/multimodal/pixtral_server.py
Normal file
@@ -0,0 +1,127 @@
|
||||
"""
|
||||
Usage:
|
||||
# Run a Pixtral model with SGLang:
|
||||
# HuggingFace:
|
||||
python -m sglang.launch_server --model-path mistral-community/pixtral-12b --port=30000
|
||||
# ModelScope:
|
||||
python -m sglang.launch_server --model-path AI-ModelScope/pixtral-12b --port=30000
|
||||
|
||||
# Then test it with:
|
||||
python pixtral_server.py
|
||||
|
||||
This script tests the Pixtral model with either a single image or multiple images.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
import aiohttp
|
||||
import requests
|
||||
|
||||
IMAGE_TOKEN_SEP = "\n[IMG]"
|
||||
ROUTE = "/generate"
|
||||
|
||||
|
||||
async def send_request(url, data, delay=0):
    """POST ``data`` as JSON to ``url`` and return the decoded JSON reply.

    Args:
        url: Full endpoint URL to post to.
        data: JSON-serializable request body.
        delay: Seconds to sleep before the request is issued.

    Returns:
        The response body parsed as JSON.
    """
    await asyncio.sleep(delay)
    # A new client session is opened for each call and closed on exit.
    async with aiohttp.ClientSession() as client, client.post(
        url, json=data
    ) as reply:
        return await reply.json()
|
||||
|
||||
|
||||
async def test_concurrent(args):
    """Send one non-streaming generate request and print the reply.

    Builds either a single-image or a two-image prompt depending on
    ``args.single_image``, posts it through ``send_request`` and prints the
    raw response, followed by its ``"text"`` entry when present.

    Args:
        args: Parsed CLI namespace providing ``host``, ``port`` and
            ``single_image``.
    """
    endpoint = f"{args.host}:{args.port}{ROUTE}"

    if args.single_image:
        # One image: a single URL string and one image placeholder.
        images = "https://picsum.photos/id/237/400/300"
        text = f"<s>[INST]Describe this image in detail.{IMAGE_TOKEN_SEP}[/INST]"
        modalities = ["image"]
    else:
        # Several images: one image placeholder is appended per URL.
        images = [
            "https://picsum.photos/id/237/400/300",
            "https://picsum.photos/id/27/500/500",
        ]
        text = f"<s>[INST]How many photos are there? Describe each in a very short sentence.{IMAGE_TOKEN_SEP * len(images)}[/INST]"
        modalities = ["multi-images"]

    payload = {
        "text": text,
        "image_data": images,
        "sampling_params": {
            "max_new_tokens": 100,
            "temperature": 0.7,
            "top_p": 0.9,
        },
        "modalities": modalities,
    }
    result = await send_request(endpoint, payload)

    print(f"Response: {result}")
    if "text" in result:
        print("\nOutput text:", result["text"])
|
||||
|
||||
|
||||
def test_streaming(args):
    """Send one streaming generate request and print the text as it arrives.

    Builds either a single-image or a two-image prompt depending on
    ``args.single_image`` and posts it with ``"stream": True``. Each
    ``data:`` line of the response carries the cumulative output so far, so
    only the part not yet printed is written to stdout.

    Args:
        args: Parsed CLI namespace providing ``host``, ``port`` and
            ``single_image``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Use the shared ROUTE constant, as test_concurrent does.
    url = f"{args.host}:{args.port}{ROUTE}"

    # Single image test
    if args.single_image:
        prompt = f"<s>[INST]Describe this image in detail.{IMAGE_TOKEN_SEP}[/INST]"
        image_data = "https://picsum.photos/id/237/400/300"
        modality = ["image"]
    # Multiple images test
    else:
        image_urls = [
            "https://picsum.photos/id/237/400/300",
            "https://picsum.photos/id/27/500/500",
        ]
        prompt = f"<s>[INST]How many photos are there? Describe each in a very short sentence.{IMAGE_TOKEN_SEP * len(image_urls)}[/INST]"
        image_data = image_urls
        modality = ["multi-images"]

    pload = {
        "text": prompt,
        "image_data": image_data,
        "sampling_params": {"max_new_tokens": 100, "temperature": 0.7, "top_p": 0.9},
        "modalities": modality,
        "stream": True,
    }

    # The context manager releases the connection even though the loop
    # breaks out before the stream is fully consumed.
    with requests.post(url, json=pload, stream=True) as response:
        # Surface HTTP errors instead of silently printing nothing.
        response.raise_for_status()

        print("Streaming response:")
        prev = 0
        for chunk in response.iter_lines(decode_unicode=False):
            chunk = chunk.decode("utf-8")
            if chunk and chunk.startswith("data:"):
                if chunk == "data: [DONE]":
                    break
                data = json.loads(chunk[5:].strip("\n"))
                output = data["text"].strip()
                # Print only the suffix that was not shown by earlier chunks.
                print(output[prev:], end="", flush=True)
                prev = len(output)
    print("\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: always runs the non-streaming request, then
    # the streaming one unless --no-stream is given.
    cli = argparse.ArgumentParser()
    cli.add_argument("--host", default="http://127.0.0.1", type=str)
    cli.add_argument("--port", default=30000, type=int)
    cli.add_argument(
        "--single-image",
        help="Test with single image instead of multiple images",
        action="store_true",
    )
    cli.add_argument(
        "--no-stream",
        help="Don't test streaming",
        action="store_true",
    )
    args = cli.parse_args()

    asyncio.run(test_concurrent(args))

    run_streaming = not args.no_stream
    if run_streaming:
        test_streaming(args)
|
||||
@@ -6,7 +6,7 @@ Usage:
|
||||
# Endpoint Service CLI:
|
||||
python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8
|
||||
|
||||
python3 http_qwen_llava_test.py
|
||||
python3 qwen_llava_server.py
|
||||
|
||||
Output:
|
||||
"Two children pose with a large teddy bear, one holding a smaller stuffed bear, in a room with an American flag and potted plants."
|
||||
Reference in New Issue
Block a user