Support precomputed multimodal features for Qwen-VL and Gemma3 models. (#6136)
Co-authored-by: Yury Sulsky <ysulsky@tesla.com>
This commit is contained in:
163
docs/backend/vlm_query.ipynb
Normal file
163
docs/backend/vlm_query.ipynb
Normal file
@@ -0,0 +1,163 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Querying Qwen-VL"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import nest_asyncio\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply() # Run this first.\n",
|
||||
"\n",
|
||||
"model_path = \"Qwen/Qwen2.5-VL-3B-Instruct\"\n",
|
||||
"chat_template = \"qwen2-vl\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Let's create a prompt.\n",
|
||||
"\n",
|
||||
"from io import BytesIO\n",
|
||||
"import requests\n",
|
||||
"from PIL import Image\n",
|
||||
"\n",
|
||||
"from sglang.srt.openai_api.protocol import ChatCompletionRequest\n",
|
||||
"from sglang.srt.conversation import chat_templates\n",
|
||||
"\n",
|
||||
"image = Image.open(\n",
|
||||
" BytesIO(\n",
|
||||
" requests.get(\n",
|
||||
" \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
|
||||
" ).content\n",
|
||||
" )\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"conv = chat_templates[chat_template].copy()\n",
|
||||
"conv.append_message(conv.roles[0], f\"What's shown here: {conv.image_token}?\")\n",
|
||||
"conv.append_message(conv.roles[1], \"\")\n",
|
||||
"conv.image_data = [image]\n",
|
||||
"\n",
|
||||
"print(conv.get_prompt())\n",
|
||||
"image"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Query via the offline Engine API"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sglang import Engine\n",
|
||||
"\n",
|
||||
"llm = Engine(\n",
|
||||
" model_path=model_path, chat_template=chat_template, mem_fraction_static=0.8\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"out = llm.generate(prompt=conv.get_prompt(), image_data=[image])\n",
|
||||
"print(out[\"text\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Query via the offline Engine API, but send precomputed embeddings"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load the Hugging Face processor and vision encoder used to compute the image embeddings.\n",
|
||||
"\n",
|
||||
"from transformers import AutoProcessor\n",
|
||||
"from transformers import Qwen2_5_VLForConditionalGeneration\n",
|
||||
"\n",
|
||||
"processor = AutoProcessor.from_pretrained(model_path, use_fast=True)\n",
|
||||
"vision = (\n",
|
||||
" Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path).eval().visual.cuda()\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"processed_prompt = processor(\n",
|
||||
" images=[image], text=conv.get_prompt(), return_tensors=\"pt\"\n",
|
||||
")\n",
|
||||
"input_ids = processed_prompt[\"input_ids\"][0].detach().cpu().tolist()\n",
|
||||
"precomputed_features = vision(\n",
|
||||
" processed_prompt[\"pixel_values\"].cuda(), processed_prompt[\"image_grid_thw\"].cuda()\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"mm_item = dict(\n",
|
||||
" modality=\"IMAGE\",\n",
|
||||
" image_grid_thws=processed_prompt[\"image_grid_thw\"],\n",
|
||||
" precomputed_features=precomputed_features,\n",
|
||||
")\n",
|
||||
"out = llm.generate(input_ids=input_ids, image_data=[mm_item])\n",
|
||||
"print(out[\"text\"])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"jupytext": {
|
||||
"cell_metadata_filter": "-all",
|
||||
"custom_cell_magics": "kql",
|
||||
"encoding": "# -*- coding: utf-8 -*-"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user