diff --git a/docs/backend/embedding_model.ipynb b/docs/backend/embedding_model.ipynb index 589e66843..45928587b 100644 --- a/docs/backend/embedding_model.ipynb +++ b/docs/backend/embedding_model.ipynb @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:47:32.337369Z", @@ -39,59 +39,7 @@ "shell.execute_reply": "2024-11-01T02:47:59.539861Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n", - "[2024-10-31 22:40:37] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-7B-instruct', chat_template=None, is_embedding=True, host='0.0.0.0', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=309155486, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', 
ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n", - "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n", - "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n", - "[2024-10-31 22:40:42 TP0] Init torch distributed begin.\n", - "[2024-10-31 22:40:43 TP0] Load weight begin. avail mem=47.27 GB\n", - "[2024-10-31 22:40:43 TP0] lm_eval is not installed, GPTQ may not be usable\n", - "INFO 10-31 22:40:44 weight_utils.py:243] Using model weights format ['*.safetensors']\n", - "Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", @@ -119,7 +67,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:47:59.543958Z", @@ -128,28 +76,7 @@ "shell.execute_reply": "2024-11-01T02:47:59.590809Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 22:40:57 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-10-31 22:40:57] INFO: 127.0.0.1:51746 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Text embedding (first 10): [0.0083160400390625, 0.0006804466247558594, -0.00809478759765625, -0.0006995201110839844, 0.0143890380859375, -0.0090179443359375, 0.01238250732421875, 0.00209808349609375, 0.0062103271484375, -0.003047943115234375]" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import subprocess, json\n", "\n", @@ -176,7 +103,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:47:59.594229Z", @@ -185,28 +112,7 @@ "shell.execute_reply": "2024-11-01T02:48:00.005255Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 22:40:58 TP0] Prefill batch. 
#new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 21.43%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-10-31 22:40:58] INFO: 127.0.0.1:51750 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Text embedding (first 10): [0.00829315185546875, 0.0007004737854003906, -0.00809478759765625, -0.0006799697875976562, 0.01438140869140625, -0.00897979736328125, 0.0123748779296875, 0.0020923614501953125, 0.006195068359375, -0.0030498504638671875]" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import openai\n", "\n", @@ -233,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:48:00.008858Z", @@ -242,36 +148,7 @@ "shell.execute_reply": "2024-11-01T02:48:01.871573Z" } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 22:41:00 TP0] Prefill batch. 
#new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 33.33%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-10-31 22:41:00] INFO: 127.0.0.1:51762 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Input IDs embedding (first 10): [0.00829315185546875, 0.0007004737854003906, -0.00809478759765625, -0.0006799697875976562, 0.01438140869140625, -0.00897979736328125, 0.0123748779296875, 0.0020923614501953125, 0.006195068359375, -0.0030498504638671875]" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import json\n", "import os\n", diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb new file mode 100644 index 000000000..65cbbab18 --- /dev/null +++ b/docs/backend/native_api.ipynb @@ -0,0 +1,286 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Native Server API\n", + "\n", + "Apart from the OpenAI compatible API, the SGLang Runtime also provides its native server API. We introduce the following APIs:\n", + "\n", + "- `/generate`\n", + "- `/update_weights`\n", + "- `/get_server_args`\n", + "- `/get_model_info`\n", + "- `/health`\n", + "- `/health_generate`\n", + "- `/flush_cache`\n", + "- `/get_memory_pool_size`\n", + "\n", + "We mainly use `requests` to test these APIs in the following examples. You can also use `curl`."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Launch A Server" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sglang.utils import (\n", + " execute_shell_command,\n", + " wait_for_server,\n", + " terminate_process,\n", + " print_highlight,\n", + ")\n", + "import subprocess, json\n", + "\n", + "server_process = execute_shell_command(\n", + "\"\"\"\n", + "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --port=30010\n", + "\"\"\"\n", + ")\n", + "\n", + "wait_for_server(\"http://localhost:30010\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generate\n", + "\n", + "Used to generate a completion from the model, similar to the `/v1/completions` API in OpenAI. Detailed parameters can be found in the [sampling parameters](https://sgl-project.github.io/references/sampling_params.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "url = \"http://localhost:30010/generate\"\n", + "data = {\"text\": \"List 3 countries and their capitals.\"}\n", + "\n", + "response = requests.post(url, json=data)\n", + "print_highlight(response.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get Server Args\n", + "\n", + "Used to get the serving args when the server is launched."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "url = \"http://localhost:30010/get_server_args\"\n", + "\n", + "response = requests.get(url)\n", + "print_highlight(response.json())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get Model Info\n", + "\n", + "Used to get the model info.\n", + "\n", + "- `model_path`: The path/name of the model.\n", + "- `is_generation`: Whether the model is used as generation model or embedding model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "url = \"http://localhost:30010/get_model_info\"\n", + "\n", + "response = requests.get(url)\n", + "response_json = response.json()\n", + "print_highlight(response_json)\n", + "assert response_json[\"model_path\"] == \"meta-llama/Llama-3.2-1B-Instruct\"\n", + "assert response_json[\"is_generation\"] == True\n", + "assert response_json.keys() == {\"model_path\", \"is_generation\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Health and Health Generate\n", + "\n", + "- `/health`: Check the health of the server.\n", + "- `/health_generate`: Check the health of the server by generating one token." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "url = \"http://localhost:30010/health_generate\"\n", + "\n", + "response = requests.get(url)\n", + "print_highlight(response.text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "url = \"http://localhost:30010/health\"\n", + "\n", + "response = requests.get(url)\n", + "print_highlight(response.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Flush Cache\n", + "\n", + "Used to flush the radix cache. 
It will be automatically triggered when the model weights are updated by the `/update_weights` API." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# flush cache\n", + "\n", + "url = \"http://localhost:30010/flush_cache\"\n", + "\n", + "response = requests.post(url)\n", + "print_highlight(response.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get Memory Pool Size\n", + "\n", + "Get the memory pool size in number of tokens.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get_memory_pool_size\n", + "\n", + "url = \"http://localhost:30010/get_memory_pool_size\"\n", + "\n", + "response = requests.get(url)\n", + "print_highlight(response.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Update Weights\n", + "\n", + "Update model weights without restarting the server. Used for continuous evaluation during training. Only applicable for models with the same architecture and parameter size."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# successful update with same architecture and size\n", + "\n", + "url = \"http://localhost:30010/update_weights\"\n", + "data = {\"model_path\": \"meta-llama/Llama-3.2-1B\"}\n", + "\n", + "response = requests.post(url, json=data)\n", + "print_highlight(response.text)\n", + "assert response.json()[\"success\"] == True\n", + "assert response.json()[\"message\"] == \"Succeeded to update model weights.\"\n", + "assert response.json().keys() == {\"success\", \"message\"}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# failed update with different parameter size\n", + "\n", + "url = \"http://localhost:30010/update_weights\"\n", + "data = {\"model_path\": \"meta-llama/Llama-3.2-3B\"}\n", + "\n", + "response = requests.post(url, json=data)\n", + "response_json = response.json()\n", + "print_highlight(response_json)\n", + "assert response_json[\"success\"] == False\n", + "assert response_json[\"message\"] == (\n", + " \"Failed to update weights: The size of tensor a (2048) must match \"\n", + " \"the size of tensor b (3072) at non-singleton dimension 1.\\n\"\n", + " \"Rolling back to original weights.\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "terminate_process(server_process)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "AlphaMeemory", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index
6b649a4e4..13ea0acdb 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -38,55 +38,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-11-02 00:06:33.051950: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", - "2024-11-02 00:06:33.063961: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", - "2024-11-02 00:06:33.063983: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2024-11-02 00:06:33.581526: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "[2024-11-02 00:06:41] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=73322355, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, 
watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n", - "[2024-11-02 00:06:51 TP0] Init torch distributed begin.\n", - "[2024-11-02 00:06:54 TP0] Load weight begin. avail mem=76.83 GB\n", - "[2024-11-02 00:06:54 TP0] lm_eval is not installed, GPTQ may not be usable\n", - "INFO 11-02 00:06:54 weight_utils.py:243] Using model weights format ['*.safetensors']\n", - "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", @@ -96,7 +48,7 @@ ")\n", "\n", "server_process = execute_shell_command(\n", - "\"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0\"\n", + " \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0\"\n", ")\n", "\n", "wait_for_server(\"http://localhost:30000\")" @@ -126,29 +78,7 @@ "shell.execute_reply": "2024-11-01T02:45:18.086450Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-11-02 00:08:04 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-11-02 00:08:04 TP0] Decode batch. #running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 0.72, #queue-req: 0\n", - "[2024-11-02 00:08:04] INFO: 127.0.0.1:51178 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Response: ChatCompletion(id='bb74a7e9fcae4df7af2ee59e25aa75a5', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. 
**Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730506084, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import openai\n", "\n", @@ -189,31 +119,7 @@ "shell.execute_reply": "2024-11-01T02:45:21.192539Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-11-02 00:08:08 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-11-02 00:08:08 TP0] Decode batch. #running-req: 1, #token: 104, token usage: 0.00, gen throughput (token/s): 9.89, #queue-req: 0\n", - "[2024-11-02 00:08:09 TP0] Decode batch. #running-req: 1, #token: 144, token usage: 0.00, gen throughput (token/s): 132.64, #queue-req: 0\n", - "[2024-11-02 00:08:09 TP0] Decode batch. #running-req: 1, #token: 184, token usage: 0.00, gen throughput (token/s): 132.28, #queue-req: 0\n", - "[2024-11-02 00:08:09] INFO: 127.0.0.1:51178 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Ancient Rome's major achievements include:

1. **Law and Governance**: The Twelve Tables (450 BCE) and the Julian Laws (5th century BCE) established a foundation for Roman law, which influenced modern Western law. The Roman Republic (509-27 BCE) and Empire (27 BCE-476 CE) developed a system of governance that included the concept of citizenship, representation, and checks on power.

2. **Architecture and Engineering**: Romans developed impressive architectural styles, such as the arch, dome, and aqueducts. Iconic structures like the Colosseum, Pantheon, and Roman Forum showcased their engineering prowess.

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "response = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -259,17 +165,7 @@ "shell.execute_reply": "2024-11-01T02:45:21.675050Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-11-02 00:08:19] INFO: 127.0.0.1:44218 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n", - "[2024-11-02 00:08:19 TP0] Prefill batch. #new-seq: 1, #new-token: 15, #cached-token: 25, cache hit rate: 31.40%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "This is only a test." - ] - } - ], + "outputs": [], "source": [ "stream = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -303,30 +199,7 @@ "shell.execute_reply": "2024-11-01T02:45:23.181695Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-11-02 00:08:25 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 30.39%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-11-02 00:08:25 TP0] Decode batch. #running-req: 1, #token: 24, token usage: 0.00, gen throughput (token/s): 2.45, #queue-req: 0\n", - "[2024-11-02 00:08:25 TP0] Decode batch. #running-req: 1, #token: 64, token usage: 0.00, gen throughput (token/s): 142.10, #queue-req: 0\n", - "[2024-11-02 00:08:26] INFO: 127.0.0.1:37290 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Response: Completion(id='25412696fce14364b40430b5671fc11e', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. 
Brazil - Bras', matched_stop=None)], created=1730506106, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "response = client.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -362,62 +235,7 @@ "shell.execute_reply": "2024-11-01T02:45:26.769299Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:23 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 29.32%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:23 TP0] Decode batch. #running-req: 1, #token: 29, token usage: 0.00, gen throughput (token/s): 40.76, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:24 TP0] Decode batch. #running-req: 1, #token: 69, token usage: 0.00, gen throughput (token/s): 42.13, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:25 TP0] Decode batch. #running-req: 1, #token: 109, token usage: 0.00, gen throughput (token/s): 42.01, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:26 TP0] Decode batch. 
#running-req: 1, #token: 149, token usage: 0.00, gen throughput (token/s): 41.87, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:26] INFO: 127.0.0.1:37738 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Response: Completion(id='fe384c17aece4a5ca5fb5238dcd1adec', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=\" This can be a sci-fi story, and you have the ability to create a unique and imaginative universe.\\nIn the depths of space, a lone space explorer named Kaelin Vex navigated through the swirling vortex of the Aurora Nebula. Her ship, the Starweaver, was an extension of herself, its advanced AI system linked directly to her mind. Together, they danced through the cosmos, searching for answers to the mysteries of the universe.\\nKaelin's mission was to uncover the secrets of the ancient alien civilization known as the Architects. Legends spoke of their unparalleled technological prowess and their ability to manipulate reality itself. 
Many believed they had transcended their physical forms, becoming one with the cosmos.\\nAs Kaelin delved deeper into\", matched_stop=None)], created=1730429126, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=150, prompt_tokens=10, total_tokens=160, prompt_tokens_details=None))" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "response = client.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -463,29 +281,7 @@ "shell.execute_reply": "2024-11-01T02:45:26.793811Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:26] INFO: 127.0.0.1:57182 - \"POST /v1/files HTTP/1.1\" 200 OK\n", - "[2024-10-31 19:45:26] INFO: 127.0.0.1:57182 - \"POST /v1/batches HTTP/1.1\" 200 OK\n", - "[2024-10-31 19:45:26 TP0] Prefill batch. #new-seq: 2, #new-token: 20, #cached-token: 60, cache hit rate: 42.80%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job created with ID: batch_d9af5b49-ad3d-423e-8c30-4aaafa5c18c4" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import json\n", "import time\n", @@ -547,93 +343,7 @@ "shell.execute_reply": "2024-11-01T02:45:29.810041Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:27 TP0] Decode batch. 
#running-req: 1, #token: 69, token usage: 0.00, gen throughput (token/s): 51.72, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Batch job status: validating...trying again in 3 seconds...\n", - "[2024-10-31 19:45:29] INFO: 127.0.0.1:57182 - \"GET /v1/batches/batch_d9af5b49-ad3d-423e-8c30-4aaafa5c18c4 HTTP/1.1\" 200 OK\n", - "Batch job completed successfully!\n", - "Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n", - "[2024-10-31 19:45:29] INFO: 127.0.0.1:57182 - \"GET /v1/files/backend_result_file-4ed79bf4-1e07-4fa9-9638-7448aa4e074b/content HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Request request-1:" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730429127, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request request-2:" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730429127, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes such as 
web development, scientific computing, data analysis, artificial intelligence, and more. It was created in the late 1980s by'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Cleaning up files..." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:29] INFO: 127.0.0.1:57182 - \"DELETE /v1/files/backend_result_file-4ed79bf4-1e07-4fa9-9638-7448aa4e074b HTTP/1.1\" 200 OK\n" - ] - } - ], + "outputs": [], "source": [ "while batch_response.status not in [\"completed\", \"failed\", \"cancelled\"]:\n", " time.sleep(3)\n", @@ -688,287 +398,7 @@ "shell.execute_reply": "2024-11-01T02:45:54.850668Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:29] INFO: 127.0.0.1:57186 - \"POST /v1/files HTTP/1.1\" 200 OK\n", - "[2024-10-31 19:45:29] INFO: 127.0.0.1:57186 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Created batch job with ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Initial status: validating" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:29 TP0] Prefill batch. #new-seq: 27, #new-token: 810, #cached-token: 675, cache hit rate: 45.05%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-10-31 19:45:29 TP0] Prefill batch. 
#new-seq: 73, #new-token: 2190, #cached-token: 1825, cache hit rate: 45.33%, token usage: 0.00, #running-req: 27, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:30 TP0] Decode batch. #running-req: 100, #token: 5125, token usage: 0.02, gen throughput (token/s): 636.38, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:31 TP0] Decode batch. #running-req: 100, #token: 9125, token usage: 0.04, gen throughput (token/s): 3507.97, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:33 TP0] Decode batch. #running-req: 100, #token: 13125, token usage: 0.06, gen throughput (token/s): 3417.06, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:34 TP0] Decode batch. #running-req: 100, #token: 17125, token usage: 0.08, gen throughput (token/s): 3332.03, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:35 TP0] Decode batch. #running-req: 100, #token: 21125, token usage: 0.10, gen throughput (token/s): 3252.29, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:36 TP0] Decode batch. #running-req: 100, #token: 25125, token usage: 0.12, gen throughput (token/s): 3173.87, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:38 TP0] Decode batch. #running-req: 100, #token: 29125, token usage: 0.13, gen throughput (token/s): 3101.31, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:39 TP0] Decode batch. 
#running-req: 100, #token: 33125, token usage: 0.15, gen throughput (token/s): 3030.90, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:39] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job details (check 1 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: in_progress // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: None" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request counts: Total: 0 // Completed: 0 // Failed: 0" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:40 TP0] Decode batch. #running-req: 100, #token: 37125, token usage: 0.17, gen throughput (token/s): 2961.37, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:42 TP0] Decode batch. 
#running-req: 100, #token: 41125, token usage: 0.19, gen throughput (token/s): 2899.29, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:42] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job details (check 2 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: in_progress // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: None" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request counts: Total: 0 // Completed: 0 // Failed: 0" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:43 TP0] Decode batch. #running-req: 100, #token: 45125, token usage: 0.21, gen throughput (token/s): 2836.50, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:45 TP0] Decode batch. 
#running-req: 100, #token: 49125, token usage: 0.23, gen throughput (token/s): 2777.80, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:45] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job details (check 3 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: in_progress // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: None" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request counts: Total: 0 // Completed: 0 // Failed: 0" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:48] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job details (check 4 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: completed // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: backend_result_file-dc391511-07f2-4f94-90cb-3ed09bc4b8a3" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request counts: Total: 100 // Completed: 100 // Failed: 0" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:51] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job details (check 5 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: completed 
// Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: backend_result_file-dc391511-07f2-4f94-90cb-3ed09bc4b8a3" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request counts: Total: 100 // Completed: 100 // Failed: 0" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import json\n", "import time\n", @@ -1051,163 +481,7 @@ "shell.execute_reply": "2024-11-01T02:46:07.892310Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:54] INFO: 127.0.0.1:33180 - \"POST /v1/files HTTP/1.1\" 200 OK\n", - "[2024-10-31 19:45:54] INFO: 127.0.0.1:33180 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Created batch job with ID: batch_c30756c3-8c09-4142-9630-9590d6124986" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Initial status: validating" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:54 TP0] Prefill batch. #new-seq: 135, #new-token: 1150, #cached-token: 6275, cache hit rate: 67.38%, token usage: 0.01, #running-req: 0, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:55 TP0] Prefill batch. #new-seq: 274, #new-token: 8192, #cached-token: 6850, cache hit rate: 55.74%, token usage: 0.02, #running-req: 135, #queue-req: 91\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:56 TP0] Prefill batch. 
#new-seq: 92, #new-token: 2758, #cached-token: 2302, cache hit rate: 54.19%, token usage: 0.06, #running-req: 408, #queue-req: 1\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:45:56 TP0] Decode batch. #running-req: 500, #token: 16025, token usage: 0.07, gen throughput (token/s): 409.21, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:46:00 TP0] Decode batch. #running-req: 500, #token: 36025, token usage: 0.17, gen throughput (token/s): 5777.09, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:46:03 TP0] Decode batch. #running-req: 500, #token: 56025, token usage: 0.26, gen throughput (token/s): 5530.76, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:46:04] INFO: 127.0.0.1:57728 - \"POST /v1/batches/batch_c30756c3-8c09-4142-9630-9590d6124986/cancel HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Cancellation initiated. 
Status: cancelling" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:46:07] INFO: 127.0.0.1:57728 - \"GET /v1/batches/batch_c30756c3-8c09-4142-9630-9590d6124986 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Current status: cancelled" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Batch job successfully cancelled" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-10-31 19:46:07] INFO: 127.0.0.1:57728 - \"DELETE /v1/files/backend_input_file-0fbf83a7-301c-488e-a221-b702e24df6a5 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Successfully cleaned up input file" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Successfully deleted local batch_requests.jsonl file" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import json\n", "import time\n", diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index 25742cb6e..6b006606b 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -38,58 +38,7 @@ "cell_type": "code", "execution_count": 13, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-11-02 00:24:10.542705: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", - "2024-11-02 00:24:10.554725: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register 
factory for plugin cuDNN when one has already been registered\n", - "2024-11-02 00:24:10.554758: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2024-11-02 00:24:11.063662: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "[2024-11-02 00:24:19] server_args=ServerArgs(model_path='meta-llama/Llama-3.2-11B-Vision-Instruct', tokenizer_path='meta-llama/Llama-3.2-11B-Vision-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Llama-3.2-11B-Vision-Instruct', chat_template='llama_3_vision', is_embedding=False, host='127.0.0.1', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=553831757, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, 
disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n", - "[2024-11-02 00:24:20] Use chat template for the OpenAI-compatible API server: llama_3_vision\n", - "[2024-11-02 00:24:29 TP0] Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models.\n", - "[2024-11-02 00:24:29 TP0] Init torch distributed begin.\n", - "[2024-11-02 00:24:32 TP0] Load weight begin. avail mem=76.83 GB\n", - "[2024-11-02 00:24:32 TP0] lm_eval is not installed, GPTQ may not be usable\n", - "INFO 11-02 00:24:32 weight_utils.py:243] Using model weights format ['*.safetensors']\n", - "Loading safetensors checkpoint shards: 0% Completed | 0/5 [00:00

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", @@ -99,7 +48,7 @@ ")\n", "\n", "embedding_process = execute_shell_command(\n", - "\"\"\"\n", + " \"\"\"\n", "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-11B-Vision-Instruct \\\n", " --port=30010 --chat-template=llama_3_vision\n", "\"\"\"\n", @@ -121,44 +70,7 @@ "cell_type": "code", "execution_count": 15, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " % Total % Received % Xferd Average Speed Time Time Time Current\n", - " Dload Upload Total Spent Left Speed\n", - "100 485 0 0 100 485 0 2420 --:--:-- --:--:-- --:--:-- 2412" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-11-02 00:26:23 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 6462, cache hit rate: 49.97%, token usage: 0.02, #running-req: 0, #queue-req: 0\n", - "[2024-11-02 00:26:24] INFO: 127.0.0.1:39828 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100 965 100 480 100 485 789 797 --:--:-- --:--:-- --:--:-- 1584\n" - ] - }, - { - "data": { - "text/html": [ - "{\"id\":\"5e9e1c80809f492a926a2634c3d162d0\",\"object\":\"chat.completion\",\"created\":1730507184,\"model\":\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"The image depicts a man ironing clothes on an ironing board that is placed on the back of a yellow taxi cab.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":6463,\"total_tokens\":6489,\"completion_tokens\":26,\"prompt_tokens_details\":null}}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import subprocess\n", "\n", 
@@ -206,29 +118,7 @@ "cell_type": "code", "execution_count": 16, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-11-02 00:26:33 TP0] Prefill batch. #new-seq: 1, #new-token: 11, #cached-token: 6452, cache hit rate: 66.58%, token usage: 0.02, #running-req: 0, #queue-req: 0\n", - "[2024-11-02 00:26:34 TP0] Decode batch. #running-req: 1, #token: 6477, token usage: 0.02, gen throughput (token/s): 0.77, #queue-req: 0\n", - "[2024-11-02 00:26:34] INFO: 127.0.0.1:43258 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "The image shows a man ironing clothes on the back of a yellow taxi cab." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from openai import OpenAI\n", "\n", @@ -246,7 +136,9 @@ " },\n", " {\n", " \"type\": \"image_url\",\n", - " \"image_url\": {\"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"},\n", + " \"image_url\": {\n", + " \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n", + " },\n", " },\n", " ],\n", " }\n", @@ -270,30 +162,7 @@ "cell_type": "code", "execution_count": 11, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-11-02 00:20:30 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 12894, cache hit rate: 83.27%, token usage: 0.04, #running-req: 0, #queue-req: 0\n", - "[2024-11-02 00:20:30 TP0] Decode batch. #running-req: 1, #token: 12903, token usage: 0.04, gen throughput (token/s): 2.02, #queue-req: 0\n", - "[2024-11-02 00:20:30 TP0] Decode batch. 
#running-req: 1, #token: 12943, token usage: 0.04, gen throughput (token/s): 105.52, #queue-req: 0\n", - "[2024-11-02 00:20:30] INFO: 127.0.0.1:41386 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "The first image shows a man in a yellow shirt ironing a shirt on the back of a yellow taxi cab, with a red line connecting the two objects. The second image shows a large orange \"S\" and \"G\" on a white background, with a red line connecting them." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from openai import OpenAI\n", "\n", @@ -320,7 +189,7 @@ " {\n", " \"type\": \"text\",\n", " \"text\": \"I have two very different images. They are not related at all. \"\n", - " \"Please describe the first image in one sentence, and then describe the second image in another sentence.\",\n", + " \"Please describe the first image in one sentence, and then describe the second image in another sentence.\",\n", " },\n", " ],\n", " }\n", diff --git a/docs/index.rst b/docs/index.rst index f5468c88f..48ca25ed6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,6 +25,7 @@ The core features include: backend/openai_api_completions.ipynb backend/openai_api_vision.ipynb + backend/native_api.ipynb backend/backend.md diff --git a/docs/start/send_request.ipynb b/docs/start/send_request.ipynb index 209910185..9a2a8555b 100644 --- a/docs/start/send_request.ipynb +++ b/docs/start/send_request.ipynb @@ -36,55 +36,7 @@ "shell.execute_reply": "2024-11-01T02:46:42.809147Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-11-02 00:27:25.383621: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", - "2024-11-02 00:27:25.396224: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to 
register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", - "2024-11-02 00:27:25.396257: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2024-11-02 00:27:25.922262: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "[2024-11-02 00:27:34] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=259802610, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, 
disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n", - "[2024-11-02 00:27:43 TP0] Init torch distributed begin.\n", - "[2024-11-02 00:27:48 TP0] Load weight begin. avail mem=76.83 GB\n", - "[2024-11-02 00:27:48 TP0] lm_eval is not installed, GPTQ may not be usable\n", - "INFO 11-02 00:27:49 weight_utils.py:243] Using model weights format ['*.safetensors']\n", - "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", @@ -123,45 +75,7 @@ "shell.execute_reply": "2024-11-01T02:46:51.435965Z" } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " % Total % Received % Xferd Average Speed Time Time Time Current\n", - " Dload Upload Total Spent Left Speed\n", - "100 278 0 0 100 278 0 1387 --:--:-- --:--:-- --:--:-- 1383" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-11-02 00:28:48 TP0] Prefill batch. #new-seq: 1, #new-token: 11, #cached-token: 42, cache hit rate: 40.19%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-11-02 00:28:48 TP0] Decode batch. #running-req: 1, #token: 75, token usage: 0.00, gen throughput (token/s): 1.46, #queue-req: 0\n", - "[2024-11-02 00:28:49] INFO: 127.0.0.1:53714 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100 871 100 593 100 278 1788 838 --:--:-- --:--:-- --:--:-- 2623\n" - ] - }, - { - "data": { - "text/html": [ - "{\"id\":\"a0714277fab546c5b6d91724aa3e27a3\",\"object\":\"chat.completion\",\"created\":1730507329,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"An LLM, or Large Language Model, is a type of artificial intelligence (AI) designed to process and generate human-like language, often used in applications such as chatbots, virtual assistants, and language translation software.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":53,\"total_tokens\":98,\"completion_tokens\":45,\"prompt_tokens_details\":null}}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import subprocess\n", "\n", @@ -209,29 
+123,7 @@ "shell.execute_reply": "2024-11-01T02:46:52.895318Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-11-02 00:03:52 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-11-02 00:03:52 TP0] Decode batch. #running-req: 1, #token: 65, token usage: 0.00, gen throughput (token/s): 11.33, #queue-req: 0\n", - "[2024-11-02 00:03:53] INFO: 127.0.0.1:57008 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "ChatCompletion(id='a6590143c40f4732a5c57d4c91b43f05', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730505833, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import openai\n", "\n", @@ -264,29 +156,7 @@ "cell_type": "code", "execution_count": 5, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-11-02 00:05:04 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 5, cache hit rate: 33.04%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-11-02 00:05:04 TP0] Decode batch. 
#running-req: 1, #token: 26, token usage: 0.00, gen throughput (token/s): 3.10, #queue-req: 0\n", - "[2024-11-02 00:05:04] INFO: 127.0.0.1:60536 - \"POST /generate HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "{'text': ' a city of romance, art, fashion, and history. Paris is a must-visit destination for anyone who loves culture, architecture, and cuisine. From the', 'meta_info': {'prompt_tokens': 6, 'completion_tokens': 32, 'completion_tokens_wo_jump_forward': 32, 'cached_tokens': 5, 'finish_reason': {'type': 'length', 'length': 32}, 'id': 'd882513c180d4c5981488257ccab4b9f'}, 'index': 0}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import requests\n", "\n",