diff --git a/docs/backend/backend.md b/docs/backend/backend.md index 47298c039..3692d7217 100644 --- a/docs/backend/backend.md +++ b/docs/backend/backend.md @@ -84,7 +84,8 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies. - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments. - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`. -- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](https://sgl-project.github.io/custom_chat_template.html). +- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](https://sgl-project.github.io/references/custom_chat_template.html). + - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph` ``` # Node 0 diff --git a/docs/backend/openai_api.ipynb b/docs/backend/openai_api_completions.ipynb similarity index 80% rename from docs/backend/openai_api.ipynb rename to docs/backend/openai_api_completions.ipynb index 9b9ba7ab0..dc89500f6 100644 --- a/docs/backend/openai_api.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -4,166 +4,74 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# OpenAI Compatible API\n", + "# OpenAI-Compatible APIs - Completions\n", "\n", - "SGLang provides an OpenAI compatible API for smooth transition from OpenAI services. Full reference of the API is available at [OpenAI API Reference](https://platform.openai.com/docs/api-reference).\n", + "SGLang provides OpenAI-compatible APIs to enable a smooth transition from OpenAI services to self-hosted local models.\n", + "A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/api-reference).\n", "\n", - "This tutorial covers these popular APIs:\n", + "This tutorial covers the following popular APIs:\n", "\n", "- `chat/completions`\n", "- `completions`\n", "- `batches`\n", - "- `embeddings`(refer to [embedding_model.ipynb](embedding_model.ipynb))" + "\n", + "Check out other tutorials to learn about vision APIs for vision-language models and embedding APIs for embedding models." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Chat Completions\n", + "## Launch A Server\n", "\n", - "### Usage\n", + "This code block is equivalent to executing \n", "\n", - "Similar to [send_request.ipynb](send_request.ipynb), we can send a chat completion request to SGLang server with OpenAI API format." + "```bash\n", + "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", + "--port 30000 --host 0.0.0.0\n", + "```\n", + "\n", + "in your terminal and wait for the server to be ready." 
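For reference, the notebooks in this repository start the server with small helpers from `sglang.utils`. A minimal sketch of such a launch cell, assuming those helpers and using `server_process` as the variable name that the shutdown cell at the end of the notebook expects:

```python
from sglang.utils import execute_shell_command, terminate_process, wait_for_server

# Start the server as a background process; this is equivalent to running the
# `python -m sglang.launch_server ...` command above in a separate terminal.
server_process = execute_shell_command(
    """
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
--port 30000 --host 0.0.0.0
"""
)

# Block until the HTTP endpoint starts answering requests.
wait_for_server("http://localhost:30000")

# When you are done with the examples below, shut the server down with:
# terminate_process(server_process)
```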
] }, { "cell_type": "code", "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-11-01T02:44:46.419815Z", - "iopub.status.busy": "2024-11-01T02:44:46.419509Z", - "iopub.status.idle": "2024-11-01T02:45:16.621648Z", - "shell.execute_reply": "2024-11-01T02:45:16.620659Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:44:51] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=357249111, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n", - "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:44:57 TP0] Init torch distributed begin.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:44:58 TP0] Load weight begin. 
avail mem=47.27 GB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:44:59 TP0] lm_eval is not installed, GPTQ may not be usable\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO 10-31 19:44:59 weight_utils.py:243] Using model weights format ['*.safetensors']\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r", - "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00Response: ChatCompletion(id='e04fce6c460d4764af68007fc82763e1', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730429118, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))" + "Response: ChatCompletion(id='bb74a7e9fcae4df7af2ee59e25aa75a5', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730506084, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))" ], "text/plain": [ "" @@ -285,41 +194,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "[2024-10-31 19:45:18 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:18 TP0] Decode batch. #running-req: 1, #token: 104, token usage: 0.00, gen throughput (token/s): 39.15, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:19 TP0] Decode batch. #running-req: 1, #token: 144, token usage: 0.00, gen throughput (token/s): 41.80, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:20 TP0] Decode batch. #running-req: 1, #token: 184, token usage: 0.00, gen throughput (token/s): 41.81, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:21] INFO: 127.0.0.1:37738 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" + "[2024-11-02 00:08:08 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-11-02 00:08:08 TP0] Decode batch. #running-req: 1, #token: 104, token usage: 0.00, gen throughput (token/s): 9.89, #queue-req: 0\n", + "[2024-11-02 00:08:09 TP0] Decode batch. #running-req: 1, #token: 144, token usage: 0.00, gen throughput (token/s): 132.64, #queue-req: 0\n", + "[2024-11-02 00:08:09 TP0] Decode batch. 
#running-req: 1, #token: 184, token usage: 0.00, gen throughput (token/s): 132.28, #queue-req: 0\n", + "[2024-11-02 00:08:09] INFO: 127.0.0.1:51178 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ - "Ancient Rome's major achievements include:

1. **Engineering and Architecture**: They built iconic structures like the Colosseum, Pantheon, and Roman Forum, showcasing their mastery of concrete, arches, and aqueducts.
2. **Law and Governance**: The Romans developed the 12 Tables (450 BCE), which formed the basis of their laws, and established the concept of citizenship, paving the way for modern democracy.
3. **Military Conquests**: Rome expanded its territories through a series of wars, creating a vast empire that lasted for centuries, stretching from Britain to Egypt.
4. **Language and Literature**: Latin became
" + "Ancient Rome's major achievements include:

1. **Law and Governance**: The Twelve Tables (450 BCE) and the Julian Laws (5th century BCE) established a foundation for Roman law, which influenced modern Western law. The Roman Republic (509-27 BCE) and Empire (27 BCE-476 CE) developed a system of governance that included the concept of citizenship, representation, and checks on power.

2. **Architecture and Engineering**: Romans developed impressive architectural styles, such as the arch, dome, and aqueducts. Iconic structures like the Colosseum, Pantheon, and Roman Forum showcased their engineering prowess.

" ], "text/plain": [ "" @@ -379,37 +264,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "[2024-10-31 19:45:21] INFO: 127.0.0.1:37738 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n", - "[2024-10-31 19:45:21 TP0] Prefill batch. #new-seq: 1, #new-token: 15, #cached-token: 25, cache hit rate: 31.40%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "It looks like you're ready to" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " begin" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ". What kind of test would you like" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " to" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " conduct?" + "[2024-11-02 00:08:19] INFO: 127.0.0.1:44218 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n", + "[2024-11-02 00:08:19 TP0] Prefill batch. #new-seq: 1, #new-token: 15, #cached-token: 25, cache hit rate: 31.40%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "This is only a test." ] } ], @@ -432,7 +289,7 @@ "\n", "### Usage\n", "\n", - "Completions API is similar to Chat Completions API, but without the `messages` parameter." + "Completions API is similar to Chat Completions API, but without the `messages` parameter or chat templates." ] }, { @@ -451,28 +308,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "[2024-10-31 19:45:21 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 30.39%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-10-31 19:45:21 TP0] Decode batch. #running-req: 1, #token: 11, token usage: 0.00, gen throughput (token/s): 39.18, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:22 TP0] Decode batch. #running-req: 1, #token: 51, token usage: 0.00, gen throughput (token/s): 42.85, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:23] INFO: 127.0.0.1:37738 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" + "[2024-11-02 00:08:25 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 30.39%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-11-02 00:08:25 TP0] Decode batch. #running-req: 1, #token: 24, token usage: 0.00, gen throughput (token/s): 2.45, #queue-req: 0\n", + "[2024-11-02 00:08:25 TP0] Decode batch. #running-req: 1, #token: 64, token usage: 0.00, gen throughput (token/s): 142.10, #queue-req: 0\n", + "[2024-11-02 00:08:26] INFO: 127.0.0.1:37290 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ - "Response: Completion(id='84ca7b4df182449697c4b38a454b8834', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States Washington D.C. 2. Japan Tokyo 3. Australia Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China Beijing 2. Brazil Bras', matched_stop=None)], created=1730429123, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, prompt_tokens_details=None))" + "Response: Completion(id='25412696fce14364b40430b5671fc11e', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. 
Brazil - Bras', matched_stop=None)], created=1730506106, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))"
 ],
 "text/plain": [
 ""
@@ -596,7 +441,7 @@
 "source": [
 "## Batches\n",
 "\n",
- "We have implemented the batches API for chat completions and completions. You can upload your requests in `jsonl` files, create a batch job, and retrieve the results when the batch job is completed (which takes longer but costs less).\n",
+ "Batches APIs for chat completions and completions are also supported. You can upload your requests in `jsonl` files, create a batch job, and retrieve the results when the batch job is completed (which takes longer but costs less); a minimal end-to-end sketch of this workflow is shown below.\n",
 "\n",
 "The batches APIs are:\n",
 "\n",
@@ -1448,7 +1293,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 11,
+ "execution_count": 6,
 "metadata": {
 "execution": {
 "iopub.execute_input": "2024-11-01T02:46:07.896114Z",
 "iopub.status.busy": "2024-11-01T02:46:07.895720Z",
 "iopub.status.idle": "2024-11-01T02:46:08.413416Z",
 "shell.execute_reply": "2024-11-01T02:46:08.412630Z"
 }
 },
@@ -1479,7 +1324,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
- "version": "3.11.7"
+ "version": "3.10.12"
 }
 },
 "nbformat": 4,
diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb
new file mode 100644
index 000000000..4b7482c49
--- /dev/null
+++ b/docs/backend/openai_api_vision.ipynb
@@ -0,0 +1,382 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# OpenAI-Compatible APIs - Vision\n",
+ "\n",
+ "SGLang provides OpenAI-compatible APIs to enable a smooth transition from OpenAI services to self-hosted local models.\n",
+ "A complete reference for the vision API is available in the [OpenAI Vision Guide](https://platform.openai.com/docs/guides/vision).\n",
+ "This tutorial covers the vision APIs for vision language models.\n",
+ "\n",
+ "SGLang supports vision language models such as Llama 3.2, LLaVA-OneVision, and Qwen2-VL: \n",
+ "- [meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) \n",
+ "- [lmms-lab/llava-onevision-qwen2-72b-ov-chat](https://huggingface.co/lmms-lab/llava-onevision-qwen2-72b-ov-chat) \n",
+ "- [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Launch A Server\n",
+ "\n",
+ "This code block is equivalent to executing \n",
+ "\n",
+ "```bash\n",
+ "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-11B-Vision-Instruct \\\n",
+ " --port 30010 --chat-template llama_3_vision\n",
+ "```\n",
+ "in your terminal and wait for the server to be ready.\n",
+ "\n",
+ "Remember to add `--chat-template llama_3_vision` to specify the vision chat template, otherwise the server only supports text.\n",
+ "We need to specify `--chat-template` for vision language models because the chat template provided in the Hugging Face tokenizer only supports text."
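For the `batches` workflow described in the Batches section above, a minimal end-to-end sketch with the OpenAI Python client might look like the following. The file name, `custom_id` values, prompts, and polling interval are illustrative assumptions, and the request-body format follows OpenAI's documented batch input format rather than anything SGLang-specific:

```python
import json
import time

from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="None")

# 1) Write the requests to a .jsonl file, one JSON object per line.
#    The questions and custom_id values here are made up for illustration.
requests = [
    {
        "custom_id": f"request-{i}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "messages": [{"role": "user", "content": question}],
            "max_tokens": 32,
        },
    }
    for i, question in enumerate(["What is 2 + 2?", "Name a prime number."])
]
with open("batch_requests.jsonl", "w") as f:
    for request in requests:
        f.write(json.dumps(request) + "\n")

# 2) Upload the file and create the batch job.
uploaded_file = client.files.create(
    file=open("batch_requests.jsonl", "rb"), purpose="batch"
)
batch_job = client.batches.create(
    input_file_id=uploaded_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)

# 3) Poll until the job reaches a terminal state, then fetch the results,
#    which also come back as one JSON object per line.
while batch_job.status not in ("completed", "failed", "cancelled"):
    time.sleep(3)
    batch_job = client.batches.retrieve(batch_job.id)

if batch_job.status == "completed":
    result_bytes = client.files.content(batch_job.output_file_id).content
    for line in result_bytes.decode("utf-8").splitlines():
        print(json.loads(line))
```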
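The vision examples below fetch images from public URLs. A local image can also be embedded directly in the request as a base64 data URL; a minimal sketch, assuming a placeholder local file `example_image.png`, to run once the server launched below is ready:

```python
import base64

from openai import OpenAI

client = OpenAI(base_url="http://localhost:30010/v1", api_key="None")


def encode_image(image_path):
    # Read the file and return its contents as a base64 string.
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


base64_image = encode_image("example_image.png")  # placeholder local file

response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-11B-Vision-Instruct",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this image?"},
                {
                    "type": "image_url",
                    # Send the image bytes inline instead of a public URL.
                    "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                },
            ],
        }
    ],
    max_tokens=300,
)

print(response.choices[0].message.content)
```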
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-11-02 00:24:10.542705: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "2024-11-02 00:24:10.554725: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "2024-11-02 00:24:10.554758: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "2024-11-02 00:24:11.063662: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "[2024-11-02 00:24:19] server_args=ServerArgs(model_path='meta-llama/Llama-3.2-11B-Vision-Instruct', tokenizer_path='meta-llama/Llama-3.2-11B-Vision-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Llama-3.2-11B-Vision-Instruct', chat_template='llama_3_vision', is_embedding=False, host='127.0.0.1', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=553831757, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n", + "[2024-11-02 00:24:20] Use chat template for the OpenAI-compatible API server: llama_3_vision\n", + "[2024-11-02 00:24:29 TP0] Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models.\n", + "[2024-11-02 00:24:29 TP0] Init torch distributed begin.\n", + "[2024-11-02 00:24:32 TP0] Load weight begin. avail mem=76.83 GB\n", + "[2024-11-02 00:24:32 TP0] lm_eval is not installed, GPTQ may not be usable\n", + "INFO 11-02 00:24:32 weight_utils.py:243] Using model weights format ['*.safetensors']\n", + "Loading safetensors checkpoint shards: 0% Completed | 0/5 [00:00

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from sglang.utils import (\n",
+ " execute_shell_command,\n",
+ " wait_for_server,\n",
+ " terminate_process,\n",
+ " print_highlight,\n",
+ ")\n",
+ "\n",
+ "vision_process = execute_shell_command(\n",
+ "\"\"\"\n",
+ "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-11B-Vision-Instruct \\\n",
+ " --port=30010 --chat-template=llama_3_vision\n",
+ "\"\"\"\n",
+ ")\n",
+ "\n",
+ "wait_for_server(\"http://localhost:30010\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using cURL\n",
+ "\n",
+ "Once the server is up, you can send test requests using cURL."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " % Total % Received % Xferd Average Speed Time Time Time Current\n",
+ " Dload Upload Total Spent Left Speed\n",
+ "100 485 0 0 100 485 0 2420 --:--:-- --:--:-- --:--:-- 2412"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-11-02 00:26:23 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 6462, cache hit rate: 49.97%, token usage: 0.02, #running-req: 0, #queue-req: 0\n",
+ "[2024-11-02 00:26:24] INFO: 127.0.0.1:39828 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100 965 100 480 100 485 789 797 --:--:-- --:--:-- --:--:-- 1584\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "{\"id\":\"5e9e1c80809f492a926a2634c3d162d0\",\"object\":\"chat.completion\",\"created\":1730507184,\"model\":\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"The image depicts a man ironing clothes on an ironing board that is placed on the back of a yellow taxi cab.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":6463,\"total_tokens\":6489,\"completion_tokens\":26,\"prompt_tokens_details\":null}}"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import subprocess\n",
+ "\n",
+ "curl_command = \"\"\"\n",
+ "curl http://localhost:30010/v1/chat/completions \\\n",
+ " -H \"Content-Type: application/json\" \\\n",
+ " -H \"Authorization: Bearer None\" \\\n",
+ " -d '{\n",
+ " \"model\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n",
+ " \"messages\": [\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": [\n",
+ " {\n",
+ " \"type\": \"text\",\n",
+ " \"text\": \"What’s in this image?\"\n",
+ " },\n",
+ " {\n",
+ " \"type\": \"image_url\",\n",
+ " \"image_url\": {\n",
+ " \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
+ " }\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ " ],\n",
+ " \"max_tokens\": 300\n",
+ " }'\n",
+ "\"\"\"\n",
+ "\n",
+ "response = subprocess.check_output(curl_command, shell=True).decode()\n",
+ "print_highlight(response)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using OpenAI Python Client\n",
+ "\n",
+ "You can use the OpenAI Python API library to send requests."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-11-02 00:26:33 TP0] Prefill batch. 
#new-seq: 1, #new-token: 11, #cached-token: 6452, cache hit rate: 66.58%, token usage: 0.02, #running-req: 0, #queue-req: 0\n",
+ "[2024-11-02 00:26:34 TP0] Decode batch. #running-req: 1, #token: 6477, token usage: 0.02, gen throughput (token/s): 0.77, #queue-req: 0\n",
+ "[2024-11-02 00:26:34] INFO: 127.0.0.1:43258 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "The image shows a man ironing clothes on the back of a yellow taxi cab."
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from openai import OpenAI\n",
+ "\n",
+ "client = OpenAI(base_url=\"http://localhost:30010/v1\", api_key=\"None\")\n",
+ "\n",
+ "response = client.chat.completions.create(\n",
+ " model=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n",
+ " messages=[\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": [\n",
+ " {\n",
+ " \"type\": \"text\",\n",
+ " \"text\": \"What is in this image?\",\n",
+ " },\n",
+ " {\n",
+ " \"type\": \"image_url\",\n",
+ " \"image_url\": {\"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"},\n",
+ " },\n",
+ " ],\n",
+ " }\n",
+ " ],\n",
+ " max_tokens=300,\n",
+ ")\n",
+ "\n",
+ "print_highlight(response.choices[0].message.content)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Multiple-Image Inputs\n",
+ "\n",
+ "The server also supports multiple images and interleaved text and images if the model supports it."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-11-02 00:20:30 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 12894, cache hit rate: 83.27%, token usage: 0.04, #running-req: 0, #queue-req: 0\n",
+ "[2024-11-02 00:20:30 TP0] Decode batch. #running-req: 1, #token: 12903, token usage: 0.04, gen throughput (token/s): 2.02, #queue-req: 0\n",
+ "[2024-11-02 00:20:30 TP0] Decode batch. #running-req: 1, #token: 12943, token usage: 0.04, gen throughput (token/s): 105.52, #queue-req: 0\n",
+ "[2024-11-02 00:20:30] INFO: 127.0.0.1:41386 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "The first image shows a man in a yellow shirt ironing a shirt on the back of a yellow taxi cab, with a red line connecting the two objects. The second image shows a large orange \"S\" and \"G\" on a white background, with a red line connecting them."
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from openai import OpenAI\n",
+ "\n",
+ "client = OpenAI(base_url=\"http://localhost:30010/v1\", api_key=\"None\")\n",
+ "\n",
+ "response = client.chat.completions.create(\n",
+ " model=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n",
+ " messages=[\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": [\n",
+ " {\n",
+ " \"type\": \"image_url\",\n",
+ " \"image_url\": {\n",
+ " \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\",\n",
+ " },\n",
+ " },\n",
+ " {\n",
+ " \"type\": \"image_url\",\n",
+ " \"image_url\": {\n",
+ " \"url\": \"https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png\",\n",
+ " },\n",
+ " },\n",
+ " {\n",
+ " \"type\": \"text\",\n",
+ " \"text\": \"I have two very different images. They are not related at all. \"\n",
+ " \"Please describe the first image in one sentence, and then describe the second image in another sentence.\",\n",
+ " },\n",
+ " ],\n",
+ " }\n",
+ " ],\n",
+ " temperature=0,\n",
+ ")\n",
+ "\n",
+ "print_highlight(response.choices[0].message.content)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "terminate_process(vision_process)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Chat Template\n",
+ "\n",
+ "As mentioned before, if you do not specify a vision model's `--chat-template`, the server uses Hugging Face's default template, which only supports text.\n",
+ "\n",
+ "We list popular vision models with their chat templates:\n",
+ "\n",
+ "- [meta-llama/Llama-3.2-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) uses `llama_3_vision`.\n",
+ "- [LLaVA-NeXT](https://huggingface.co/collections/lmms-lab/llava-next-6623288e2d61edba3ddbf5ff) uses `chatml-llava`.\n",
+ "- [LLaVA-OneVision](https://huggingface.co/lmms-lab/llava-onevision-qwen2-7b-ov) uses `chatml-llava`.\n",
+ "- [Llama3-LLaVA-NeXT](https://huggingface.co/lmms-lab/llama3-llava-next-8b) uses `llava_llama_3`.\n",
+ "- [LLaVA-v1.5 / 1.6](https://huggingface.co/liuhaotian/llava-v1.6-34b) uses `vicuna_v1.1`."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/backend/vision_language_model.ipynb b/docs/backend/vision_language_model.ipynb
deleted file mode 100644
index 769c9a9d5..000000000
--- a/docs/backend/vision_language_model.ipynb
+++ /dev/null
@@ -1,431 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Vision Language Model\n",
- "\n",
- "SGLang supports vision language models in the same way as completion models. Here are some example models:\n",
- "\n",
- "- [meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)\n",
- "- [lmms-lab/llava-onevision-qwen2-7b-ov](https://huggingface.co/lmms-lab/llava-onevision-qwen2-7b-ov)\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Launch A Server\n",
- "\n",
- "The following code is equivalent to running this in the shell:\n",
- "\n",
- "```bash\n",
- "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-11B-Vision-Instruct \\\n",
- " --port=30010 --chat-template=llama_3_vision\n",
- "```\n",
- "\n",
- "Remember to add `--chat-template=llama_3_vision` to specify the vision chat template, otherwise the server only supports text."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. 
Use `HF_HOME` instead.\n", - " warnings.warn(\n", - "[2024-10-31 23:10:49] server_args=ServerArgs(model_path='meta-llama/Llama-3.2-11B-Vision-Instruct', tokenizer_path='meta-llama/Llama-3.2-11B-Vision-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Llama-3.2-11B-Vision-Instruct', chat_template='llama_3_vision', is_embedding=False, host='127.0.0.1', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=178735948, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n", - "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n", - "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n", - "[2024-10-31 23:10:51] Use chat template for the OpenAI-compatible API server: llama_3_vision\n", - "[2024-10-31 23:10:56 TP0] Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models.\n", - "[2024-10-31 23:10:56 TP0] Init torch distributed begin.\n", - "[2024-10-31 23:10:56 TP0] Load weight begin. avail mem=47.27 GB\n", - "[2024-10-31 23:10:57 TP0] lm_eval is not installed, GPTQ may not be usable\n", - "INFO 10-31 23:10:57 weight_utils.py:243] Using model weights format ['*.safetensors']\n", - "Loading safetensors checkpoint shards: 0% Completed | 0/5 [00:00

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from sglang.utils import (\n", - " execute_shell_command,\n", - " wait_for_server,\n", - " terminate_process,\n", - " print_highlight,\n", - ")\n", - "\n", - "embedding_process = execute_shell_command(\n", - " \"\"\"\n", - " python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-11B-Vision-Instruct \\\n", - " --port=30010 --chat-template=llama_3_vision\n", - "\n", - "\"\"\"\n", - ")\n", - "\n", - "wait_for_server(\"http://localhost:30010\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Use Curl" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " % Total % Received % Xferd Average Speed Time Time Time Current\n", - " Dload Upload Total Spent Left Speed\n", - "100 559 0 0 100 559 0 253 0:00:02 0:00:02 --:--:-- 253" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " return torch.load(io.BytesIO(b))\n", - "[2024-10-31 23:11:18 TP0] Prefill batch. #new-seq: 1, #new-token: 6463, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100 559 0 0 100 559 0 174 0:00:03 0:00:03 --:--:-- 174" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 23:11:20 TP0] Decode batch. #running-req: 1, #token: 6496, token usage: 0.05, gen throughput (token/s): 3.90, #queue-req: 0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100 559 0 0 100 559 0 107 0:00:05 0:00:05 --:--:-- 107" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 23:11:21 TP0] Decode batch. #running-req: 1, #token: 6536, token usage: 0.05, gen throughput (token/s): 33.67, #queue-req: 0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100 559 0 0 100 559 0 90 0:00:06 0:00:06 --:--:-- 0" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 23:11:22 TP0] Decode batch. 
#running-req: 1, #token: 6576, token usage: 0.05, gen throughput (token/s): 33.60, #queue-req: 0\n", - "[2024-10-31 23:11:22] INFO: 127.0.0.1:54224 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100 1544 100 985 100 559 142 80 0:00:06 0:00:06 --:--:-- 265\n" - ] - }, - { - "data": { - "text/html": [ - "{'id': 'f618453e8f3e4408b893a958f2868a44', 'object': 'chat.completion', 'created': 1730441482, 'model': 'meta-llama/Llama-3.2-11B-Vision-Instruct', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'The image depicts a serene and peaceful landscape featuring a wooden boardwalk that meanders through a lush grassy field, set against a backdrop of trees and a bright blue sky with wispy clouds. The boardwalk is made of weathered wooden planks and is surrounded by tall grass on either side, creating a sense of depth and texture. The surrounding trees add a touch of natural beauty to the scene, while the blue sky with wispy clouds provides a sense of calmness and serenity. The overall atmosphere of the image is one of tranquility and relaxation, inviting the viewer to step into the peaceful world depicted.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}], 'usage': {'prompt_tokens': 6463, 'total_tokens': 6588, 'completion_tokens': 125, 'prompt_tokens_details': None}}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import subprocess, json, os\n", - "\n", - "curl_command = \"\"\"\n", - "curl http://localhost:30010/v1/chat/completions \\\n", - " -H \"Content-Type: application/json\" \\\n", - " -H \"Authorization: Bearer None\" \\\n", - " -d '{\n", - " \"model\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n", - " \"messages\": [\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": [\n", - " {\n", - " \"type\": \"text\",\n", - " \"text\": \"What’s in this image?\"\n", - " },\n", - " {\n", - " \"type\": \"image_url\",\n", - " \"image_url\": {\n", - " \"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\"\n", - " }\n", - " }\n", - " ]\n", - " }\n", - " ],\n", - " \"max_tokens\": 300\n", - " }'\n", - "\"\"\"\n", - "\n", - "response = json.loads(subprocess.check_output(curl_command, shell=True))\n", - "print_highlight(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using OpenAI Compatible API" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 23:11:23 TP0] Prefill batch. #new-seq: 1, #new-token: 6463, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-10-31 23:11:24 TP0] Decode batch. #running-req: 1, #token: 6492, token usage: 0.05, gen throughput (token/s): 20.07, #queue-req: 0\n", - "[2024-10-31 23:11:25 TP0] Decode batch. #running-req: 1, #token: 6532, token usage: 0.05, gen throughput (token/s): 33.68, #queue-req: 0\n", - "[2024-10-31 23:11:26 TP0] Decode batch. #running-req: 1, #token: 6572, token usage: 0.05, gen throughput (token/s): 33.62, #queue-req: 0\n", - "[2024-10-31 23:11:27 TP0] Decode batch. 
#running-req: 1, #token: 6612, token usage: 0.05, gen throughput (token/s): 33.62, #queue-req: 0\n", - "[2024-10-31 23:11:28] INFO: 127.0.0.1:54228 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "The image depicts a serene and peaceful scene of a wooden boardwalk leading through a lush field of tall grass, set against a backdrop of trees and a blue sky with clouds. The boardwalk is made of light-colored wood and has a simple design, with the wooden planks running parallel to each other. It stretches out into the distance, disappearing into the horizon.

The field is filled with tall, vibrant green grass that sways gently in the breeze, creating a sense of movement and life. The trees in the background are also lush and green, adding depth and texture to the scene. The blue sky above is dotted with white clouds, which are scattered across the horizon. The overall atmosphere of the image is one of tranquility and serenity, inviting the viewer to step into the peaceful world depicted.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import base64, requests\n", - "from openai import OpenAI\n", - "\n", - "client = OpenAI(base_url=\"http://localhost:30010/v1\", api_key=\"None\")\n", - "\n", - "\n", - "def encode_image(image_path):\n", - " with open(image_path, \"rb\") as image_file:\n", - " return base64.b64encode(image_file.read()).decode(\"utf-8\")\n", - "\n", - "\n", - "def download_image(image_url, image_path):\n", - " response = requests.get(image_url)\n", - " response.raise_for_status()\n", - " with open(image_path, \"wb\") as f:\n", - " f.write(response.content)\n", - "\n", - "\n", - "image_url = \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\"\n", - "image_path = \"boardwalk.jpeg\"\n", - "download_image(image_url, image_path)\n", - "\n", - "base64_image = encode_image(image_path)\n", - "\n", - "response = client.chat.completions.create(\n", - " model=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n", - " messages=[\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": [\n", - " {\n", - " \"type\": \"text\",\n", - " \"text\": \"What is in this image?\",\n", - " },\n", - " {\n", - " \"type\": \"image_url\",\n", - " \"image_url\": {\"url\": f\"data:image/jpeg;base64,{base64_image}\"},\n", - " },\n", - " ],\n", - " }\n", - " ],\n", - " max_tokens=300,\n", - ")\n", - "\n", - "print_highlight(response.choices[0].message.content)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Multiple Images Input" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 23:11:28 TP0] Prefill batch. #new-seq: 1, #new-token: 12871, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-10-31 23:11:30 TP0] Decode batch. #running-req: 1, #token: 12899, token usage: 0.10, gen throughput (token/s): 15.36, #queue-req: 0\n", - "[2024-10-31 23:11:31 TP0] Decode batch. #running-req: 1, #token: 12939, token usage: 0.10, gen throughput (token/s): 33.33, #queue-req: 0\n", - "[2024-10-31 23:11:32 TP0] Decode batch. #running-req: 1, #token: 12979, token usage: 0.10, gen throughput (token/s): 33.28, #queue-req: 0\n", - "[2024-10-31 23:11:33] INFO: 127.0.0.1:50966 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n", - "Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='The two images depict a serene and idyllic scene, with the first image showing a well-trodden wooden path through a field, while the second image shows an overgrown, less-traveled path through the same field. The first image features a clear and well-maintained wooden path, whereas the second image shows a more neglected and overgrown path that is not as well-defined. The first image has a more vibrant and inviting atmosphere, while the second image appears more peaceful and serene. 
Overall, both images evoke a sense of tranquility and connection to nature.', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)\n" - ] - } - ], - "source": [ - "from openai import OpenAI\n", - "\n", - "client = OpenAI(base_url=\"http://localhost:30010/v1\", api_key=\"None\")\n", - "\n", - "response = client.chat.completions.create(\n", - " model=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n", - " messages=[\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": [\n", - " {\n", - " \"type\": \"text\",\n", - " \"text\": \"Are there any differences between these two images?\",\n", - " },\n", - " {\n", - " \"type\": \"image_url\",\n", - " \"image_url\": {\n", - " \"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\",\n", - " },\n", - " },\n", - " {\n", - " \"type\": \"image_url\",\n", - " \"image_url\": {\n", - " \"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\",\n", - " },\n", - " },\n", - " ],\n", - " }\n", - " ],\n", - " max_tokens=300,\n", - ")\n", - "print(response.choices[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "terminate_process(embedding_process)\n", - "os.remove(image_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Chat Template\n", - "\n", - "As mentioned before, if you do not specify a vision model's `chat-template`, the server uses Hugging Face's default template, which only supports text.\n", - "\n", - "You can add your custom chat template by referring to the [custom chat template](../references/custom_chat_template.md).\n", - "\n", - "We list popular vision models with their chat templates:\n", - "\n", - "- [meta-llama/Llama-3.2-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) uses `llama_3_vision`.\n", - "- [LLaVA-NeXT](https://huggingface.co/collections/lmms-lab/llava-next-6623288e2d61edba3ddbf5ff) uses `chatml-llava`.\n", - "- [llama3-llava-next](https://huggingface.co/lmms-lab/llama3-llava-next-8b) uses `llava_llama_3`.\n", - "- [llava-onevision](https://huggingface.co/lmms-lab/llava-onevision-qwen2-7b-ov) uses `chatml-llava`.\n", - "- [liuhaotian/llava-v1.5 / 1.6](https://huggingface.co/liuhaotian/llava-v1.5-13b) uses `vicuna_v1.1`." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "AlphaMeemory", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/index.rst b/docs/index.rst index 6601c57d5..b365f5701 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -23,8 +23,8 @@ The core features include: :maxdepth: 1 :caption: Backend Tutorial - backend/openai_api.ipynb - backend/vision_language_model.ipynb + backend/openai_api_completions.ipynb + backend/openai_api_vision.ipynb backend/backend.md @@ -46,5 +46,5 @@ The core features include: references/choices_methods.md references/benchmark_and_profiling.md references/troubleshooting.md - references/embedding_model.ipynb + references/custom_chat_template.md references/learn_more.md diff --git a/docs/references/custom_chat_template.md b/docs/references/custom_chat_template.md index 64b33a0a4..0a5225da2 100644 --- a/docs/references/custom_chat_template.md +++ b/docs/references/custom_chat_template.md @@ -1,3 +1,5 @@ +.. _custom-chat-template: + # Custom Chat Template in SGLang Runtime **NOTE**: There are two chat template systems in SGLang project. This document is about setting a custom chat template for the OpenAI-compatible API server (defined at [conversation.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/conversation.py)). It is NOT related to the chat template used in the SGLang language frontend (defined at [chat_template.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/lang/chat_template.py)). diff --git a/docs/references/sampling_params.md b/docs/references/sampling_params.md index 78d5193c2..062e0c99b 100644 --- a/docs/references/sampling_params.md +++ b/docs/references/sampling_params.md @@ -1,3 +1,5 @@ +.. _sampling-parameters: + # Sampling Parameters in SGLang Runtime This doc describes the sampling parameters of the SGLang Runtime. It is the low-level endpoint of the runtime. diff --git a/docs/starts/send_request.ipynb b/docs/starts/send_request.ipynb index dda2371b5..2b095c259 100644 --- a/docs/starts/send_request.ipynb +++ b/docs/starts/send_request.ipynb @@ -22,12 +22,12 @@ "--port 30000 --host 0.0.0.0\n", "```\n", "\n", - "in your command line and wait for the server to be ready." + "in your terminal and wait for the server to be ready." ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:46:13.611212Z", @@ -41,127 +41,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. 
Use `HF_HOME` instead.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:46:18] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=706578968, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n", - "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:46:24 TP0] Init torch distributed begin.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:46:24 TP0] Load weight begin. 
avail mem=47.27 GB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:46:25 TP0] lm_eval is not installed, GPTQ may not be usable\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO 10-31 19:46:26 weight_utils.py:243] Using model weights format ['*.safetensors']\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r", - "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00{\"id\":\"a0714277fab546c5b6d91724aa3e27a3\",\"object\":\"chat.completion\",\"created\":1730507329,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"An LLM, or Large Language Model, is a type of artificial intelligence (AI) designed to process and generate human-like language, often used in applications such as chatbots, virtual assistants, and language translation software.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":53,\"total_tokens\":98,\"completion_tokens\":45,\"prompt_tokens_details\":null}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "!curl http://localhost:30000/v1/chat/completions \\\n", - " -H \"Content-Type: application/json\" \\\n", - " -H \"Authorization: Bearer None\" \\\n", - " -d '{\"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"messages\": [{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}, {\"role\": \"user\", \"content\": \"What is a LLM?\"}]}'" + "import subprocess\n", + "\n", + "curl_command = \"\"\"\n", + "curl http://localhost:30000/v1/chat/completions \\\\\n", + " -H \"Content-Type: application/json\" \\\\\n", + " -H \"Authorization: Bearer None\" \\\\\n", + " -d '{\n", + " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": \"You are a helpful assistant.\"\n", + " },\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"What is an LLM? Tell me in one sentence.\"\n", + " }\n", + " ]\n", + " }'\n", + "\"\"\"\n", + "\n", + "response = subprocess.check_output(curl_command, shell=True).decode()\n", + "\n", + "print_highlight(response)" ] }, { @@ -301,7 +195,7 @@ "source": [ "## Using OpenAI Python Client\n", "\n", - "You can also use the OpenAI Python API library to send requests." + "You can use the OpenAI Python API library to send requests." ] }, { @@ -320,22 +214,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "[2024-10-31 19:46:51 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-10-31 19:46:51 TP0] Decode batch. #running-req: 1, #token: 50, token usage: 0.00, gen throughput (token/s): 27.57, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:46:52 TP0] Decode batch. #running-req: 1, #token: 90, token usage: 0.00, gen throughput (token/s): 42.69, #queue-req: 0\n", - "[2024-10-31 19:46:52] INFO: 127.0.0.1:40952 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" + "[2024-11-02 00:03:52 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-11-02 00:03:52 TP0] Decode batch. 
#running-req: 1, #token: 65, token usage: 0.00, gen throughput (token/s): 11.33, #queue-req: 0\n",
+ "[2024-11-02 00:03:53] INFO: 127.0.0.1:57008 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
 ]
 },
 {
 "data": {
 "text/html": [
- "ChatCompletion(id='c563abb8fe74496f83203fe21ec4ff61', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730429212, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))"
+ "ChatCompletion(id='a6590143c40f4732a5c57d4c91b43f05', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730505833, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))"
 ],
 "text/plain": [
 ""
@@ -359,12 +246,67 @@
 " temperature=0,\n",
 " max_tokens=64,\n",
 ")\n",
+ "\n",
 "print_highlight(response)"
 ]
 },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using Native Generation APIs\n",
+ "\n",
+ "You can also use the native `/generate` endpoint. It provides more flexibility.\n",
+ "An API reference is available at [Sampling Parameters](https://sgl-project.github.io/references/sampling_params.html)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-11-02 00:05:04 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 5, cache hit rate: 33.04%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+ "[2024-11-02 00:05:04 TP0] Decode batch. #running-req: 1, #token: 26, token usage: 0.00, gen throughput (token/s): 3.10, #queue-req: 0\n",
+ "[2024-11-02 00:05:04] INFO: 127.0.0.1:60536 - \"POST /generate HTTP/1.1\" 200 OK\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "{'text': ' a city of romance, art, fashion, and history. Paris is a must-visit destination for anyone who loves culture, architecture, and cuisine. 
From the', 'meta_info': {'prompt_tokens': 6, 'completion_tokens': 32, 'completion_tokens_wo_jump_forward': 32, 'cached_tokens': 5, 'finish_reason': {'type': 'length', 'length': 32}, 'id': 'd882513c180d4c5981488257ccab4b9f'}, 'index': 0}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import requests\n", + "\n", + "response = requests.post(\n", + " \"http://localhost:30000/generate\",\n", + " json={\n", + " \"text\": \"The capital of France is\",\n", + " \"sampling_params\": {\n", + " \"temperature\": 0,\n", + " \"max_new_tokens\": 32,\n", + " },\n", + " },\n", + ")\n", + "\n", + "print_highlight(response.json())" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:46:52.898411Z", @@ -384,18 +326,6 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" } }, "nbformat": 4, diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py index f44bc98e2..a5bf302e2 100644 --- a/test/srt/test_vision_openai_server.py +++ b/test/srt/test_vision_openai_server.py @@ -132,7 +132,7 @@ class TestOpenAIVisionServer(unittest.TestCase): assert response.usage.completion_tokens > 0 assert response.usage.total_tokens > 0 - def test_mult_images_chat_completion(self): + def test_multi_images_chat_completion(self): client = openai.Client(api_key=self.api_key, base_url=self.base_url) response = client.chat.completions.create(