add native api docs (#1883)

Co-authored-by: Chayenne <zhaochenyang@g.ucla.edu>
2024-11-02 00:17:30 -07:00
parent 146f613405
commit 72e979bfb5
6 changed files with 319 additions and 1142 deletions
--- a/docs/backend/openai_api_completions.ipynb
+++ b/docs/backend/openai_api_completions.ipynb
@@ -38,55 +38,7 @@
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "2024-11-02 00:06:33.051950: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
-      "2024-11-02 00:06:33.063961: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
-      "2024-11-02 00:06:33.063983: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
-      "2024-11-02 00:06:33.581526: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
-      "[2024-11-02 00:06:41] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=73322355, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
-      "[2024-11-02 00:06:51 TP0] Init torch distributed begin.\n",
-      "[2024-11-02 00:06:54 TP0] Load weight begin. avail mem=76.83 GB\n",
-      "[2024-11-02 00:06:54 TP0] lm_eval is not installed, GPTQ may not be usable\n",
-      "INFO 11-02 00:06:54 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
-      "Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]\n",
-      "Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:01<00:03,  1.22s/it]\n",
-      "Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:02<00:02,  1.24s/it]\n",
-      "Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:03<00:01,  1.29s/it]\n",
-      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:04<00:00,  1.06it/s]\n",
-      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:04<00:00,  1.06s/it]\n",
-      "\n",
-      "[2024-11-02 00:06:59 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=61.82 GB\n",
-      "[2024-11-02 00:06:59 TP0] Memory pool end. avail mem=8.19 GB\n",
-      "[2024-11-02 00:07:00 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
-      "[2024-11-02 00:07:09 TP0] max_total_num_tokens=430915, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
-      "[2024-11-02 00:07:09] INFO:     Started server process [102482]\n",
-      "[2024-11-02 00:07:09] INFO:     Waiting for application startup.\n",
-      "[2024-11-02 00:07:09] INFO:     Application startup complete.\n",
-      "[2024-11-02 00:07:09] INFO:     Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
-      "[2024-11-02 00:07:09] INFO:     127.0.0.1:33692 - \"GET /v1/models HTTP/1.1\" 200 OK\n",
-      "[2024-11-02 00:07:10] INFO:     127.0.0.1:33702 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
-      "[2024-11-02 00:07:10 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-      "[2024-11-02 00:07:10] INFO:     127.0.0.1:33710 - \"POST /generate HTTP/1.1\" 200 OK\n",
-      "[2024-11-02 00:07:10] The server is fired up and ready to roll!\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'><br><br>                    NOTE: Typically, the server runs in a separate terminal.<br>                    In this notebook, we run the server and notebook code together, so their outputs are combined.<br>                    To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.<br>                    </strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "from sglang.utils import (\n",
    "    execute_shell_command,\n",
@@ -96,7 +48,7 @@
    ")\n",
    "\n",
    "server_process = execute_shell_command(\n",
-    "\"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0\"\n",
+    "    \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0\"\n",
    ")\n",
    "\n",
    "wait_for_server(\"http://localhost:30000\")"
@@ -126,29 +78,7 @@
     "shell.execute_reply": "2024-11-01T02:45:18.086450Z"
    }
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-11-02 00:08:04 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-      "[2024-11-02 00:08:04 TP0] Decode batch. #running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 0.72, #queue-req: 0\n",
-      "[2024-11-02 00:08:04] INFO:     127.0.0.1:51178 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Response: ChatCompletion(id='bb74a7e9fcae4df7af2ee59e25aa75a5', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730506084, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "import openai\n",
    "\n",
@@ -189,31 +119,7 @@
     "shell.execute_reply": "2024-11-01T02:45:21.192539Z"
    }
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-11-02 00:08:08 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-      "[2024-11-02 00:08:08 TP0] Decode batch. #running-req: 1, #token: 104, token usage: 0.00, gen throughput (token/s): 9.89, #queue-req: 0\n",
-      "[2024-11-02 00:08:09 TP0] Decode batch. #running-req: 1, #token: 144, token usage: 0.00, gen throughput (token/s): 132.64, #queue-req: 0\n",
-      "[2024-11-02 00:08:09 TP0] Decode batch. #running-req: 1, #token: 184, token usage: 0.00, gen throughput (token/s): 132.28, #queue-req: 0\n",
-      "[2024-11-02 00:08:09] INFO:     127.0.0.1:51178 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Ancient Rome's major achievements include:<br><br>1. **Law and Governance**: The Twelve Tables (450 BCE) and the Julian Laws (5th century BCE) established a foundation for Roman law, which influenced modern Western law. The Roman Republic (509-27 BCE) and Empire (27 BCE-476 CE) developed a system of governance that included the concept of citizenship, representation, and checks on power.<br><br>2. **Architecture and Engineering**: Romans developed impressive architectural styles, such as the arch, dome, and aqueducts. Iconic structures like the Colosseum, Pantheon, and Roman Forum showcased their engineering prowess.<br><br></strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "response = client.chat.completions.create(\n",
    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
@@ -259,17 +165,7 @@
     "shell.execute_reply": "2024-11-01T02:45:21.675050Z"
    }
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-11-02 00:08:19] INFO:     127.0.0.1:44218 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
-      "[2024-11-02 00:08:19 TP0] Prefill batch. #new-seq: 1, #new-token: 15, #cached-token: 25, cache hit rate: 31.40%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-      "This is only a test."
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "stream = client.chat.completions.create(\n",
    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
@@ -303,30 +199,7 @@
     "shell.execute_reply": "2024-11-01T02:45:23.181695Z"
    }
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-11-02 00:08:25 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 30.39%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-      "[2024-11-02 00:08:25 TP0] Decode batch. #running-req: 1, #token: 24, token usage: 0.00, gen throughput (token/s): 2.45, #queue-req: 0\n",
-      "[2024-11-02 00:08:25 TP0] Decode batch. #running-req: 1, #token: 64, token usage: 0.00, gen throughput (token/s): 142.10, #queue-req: 0\n",
-      "[2024-11-02 00:08:26] INFO:     127.0.0.1:37290 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Response: Completion(id='25412696fce14364b40430b5671fc11e', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1.  United States - Washington D.C. 2.  Japan - Tokyo 3.  Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1.  China - Beijing 2.  Brazil - Bras', matched_stop=None)], created=1730506106, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "response = client.completions.create(\n",
    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
@@ -362,62 +235,7 @@
     "shell.execute_reply": "2024-11-01T02:45:26.769299Z"
    }
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:23 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 29.32%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:23 TP0] Decode batch. #running-req: 1, #token: 29, token usage: 0.00, gen throughput (token/s): 40.76, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:24 TP0] Decode batch. #running-req: 1, #token: 69, token usage: 0.00, gen throughput (token/s): 42.13, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:25 TP0] Decode batch. #running-req: 1, #token: 109, token usage: 0.00, gen throughput (token/s): 42.01, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:26 TP0] Decode batch. #running-req: 1, #token: 149, token usage: 0.00, gen throughput (token/s): 41.87, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:26] INFO:     127.0.0.1:37738 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Response: Completion(id='fe384c17aece4a5ca5fb5238dcd1adec', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=\" This can be a sci-fi story, and you have the ability to create a unique and imaginative universe.\\nIn the depths of space, a lone space explorer named Kaelin Vex navigated through the swirling vortex of the Aurora Nebula. Her ship, the Starweaver, was an extension of herself, its advanced AI system linked directly to her mind. Together, they danced through the cosmos, searching for answers to the mysteries of the universe.\\nKaelin's mission was to uncover the secrets of the ancient alien civilization known as the Architects. Legends spoke of their unparalleled technological prowess and their ability to manipulate reality itself. Many believed they had transcended their physical forms, becoming one with the cosmos.\\nAs Kaelin delved deeper into\", matched_stop=None)], created=1730429126, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=150, prompt_tokens=10, total_tokens=160, prompt_tokens_details=None))</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "response = client.completions.create(\n",
    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
@@ -463,29 +281,7 @@
     "shell.execute_reply": "2024-11-01T02:45:26.793811Z"
    }
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:26] INFO:     127.0.0.1:57182 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
-      "[2024-10-31 19:45:26] INFO:     127.0.0.1:57182 - \"POST /v1/batches HTTP/1.1\" 200 OK\n",
-      "[2024-10-31 19:45:26 TP0] Prefill batch. #new-seq: 2, #new-token: 20, #cached-token: 60, cache hit rate: 42.80%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Batch job created with ID: batch_d9af5b49-ad3d-423e-8c30-4aaafa5c18c4</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "import json\n",
    "import time\n",
@@ -547,93 +343,7 @@
     "shell.execute_reply": "2024-11-01T02:45:29.810041Z"
    }
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:27 TP0] Decode batch. #running-req: 1, #token: 69, token usage: 0.00, gen throughput (token/s): 51.72, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Batch job status: validating...trying again in 3 seconds...\n",
-      "[2024-10-31 19:45:29] INFO:     127.0.0.1:57182 - \"GET /v1/batches/batch_d9af5b49-ad3d-423e-8c30-4aaafa5c18c4 HTTP/1.1\" 200 OK\n",
-      "Batch job completed successfully!\n",
-      "Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n",
-      "[2024-10-31 19:45:29] INFO:     127.0.0.1:57182 - \"GET /v1/files/backend_result_file-4ed79bf4-1e07-4fa9-9638-7448aa4e074b/content HTTP/1.1\" 200 OK\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Request request-1:</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730429127, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Request request-2:</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730429127, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes such as web development, scientific computing, data analysis, artificial intelligence, and more. It was created in the late 1980s by'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Cleaning up files...</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:29] INFO:     127.0.0.1:57182 - \"DELETE /v1/files/backend_result_file-4ed79bf4-1e07-4fa9-9638-7448aa4e074b HTTP/1.1\" 200 OK\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "while batch_response.status not in [\"completed\", \"failed\", \"cancelled\"]:\n",
    "    time.sleep(3)\n",
@@ -688,287 +398,7 @@
     "shell.execute_reply": "2024-11-01T02:45:54.850668Z"
    }
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:29] INFO:     127.0.0.1:57186 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
-      "[2024-10-31 19:45:29] INFO:     127.0.0.1:57186 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Created batch job with ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Initial status: validating</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:29 TP0] Prefill batch. #new-seq: 27, #new-token: 810, #cached-token: 675, cache hit rate: 45.05%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-      "[2024-10-31 19:45:29 TP0] Prefill batch. #new-seq: 73, #new-token: 2190, #cached-token: 1825, cache hit rate: 45.33%, token usage: 0.00, #running-req: 27, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:30 TP0] Decode batch. #running-req: 100, #token: 5125, token usage: 0.02, gen throughput (token/s): 636.38, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:31 TP0] Decode batch. #running-req: 100, #token: 9125, token usage: 0.04, gen throughput (token/s): 3507.97, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:33 TP0] Decode batch. #running-req: 100, #token: 13125, token usage: 0.06, gen throughput (token/s): 3417.06, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:34 TP0] Decode batch. #running-req: 100, #token: 17125, token usage: 0.08, gen throughput (token/s): 3332.03, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:35 TP0] Decode batch. #running-req: 100, #token: 21125, token usage: 0.10, gen throughput (token/s): 3252.29, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:36 TP0] Decode batch. #running-req: 100, #token: 25125, token usage: 0.12, gen throughput (token/s): 3173.87, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:38 TP0] Decode batch. #running-req: 100, #token: 29125, token usage: 0.13, gen throughput (token/s): 3101.31, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:39 TP0] Decode batch. #running-req: 100, #token: 33125, token usage: 0.15, gen throughput (token/s): 3030.90, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:39] INFO:     127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Batch job details (check 1 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: in_progress // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: None</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'><strong>Request counts: Total: 0 // Completed: 0 // Failed: 0</strong></strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:40 TP0] Decode batch. #running-req: 100, #token: 37125, token usage: 0.17, gen throughput (token/s): 2961.37, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:42 TP0] Decode batch. #running-req: 100, #token: 41125, token usage: 0.19, gen throughput (token/s): 2899.29, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:42] INFO:     127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Batch job details (check 2 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: in_progress // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: None</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'><strong>Request counts: Total: 0 // Completed: 0 // Failed: 0</strong></strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:43 TP0] Decode batch. #running-req: 100, #token: 45125, token usage: 0.21, gen throughput (token/s): 2836.50, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:45 TP0] Decode batch. #running-req: 100, #token: 49125, token usage: 0.23, gen throughput (token/s): 2777.80, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:45] INFO:     127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Batch job details (check 3 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: in_progress // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: None</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'><strong>Request counts: Total: 0 // Completed: 0 // Failed: 0</strong></strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:48] INFO:     127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Batch job details (check 4 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: completed // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: backend_result_file-dc391511-07f2-4f94-90cb-3ed09bc4b8a3</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'><strong>Request counts: Total: 100 // Completed: 100 // Failed: 0</strong></strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:51] INFO:     127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Batch job details (check 5 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: completed // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: backend_result_file-dc391511-07f2-4f94-90cb-3ed09bc4b8a3</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'><strong>Request counts: Total: 100 // Completed: 100 // Failed: 0</strong></strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "import json\n",
    "import time\n",
@@ -1051,163 +481,7 @@
     "shell.execute_reply": "2024-11-01T02:46:07.892310Z"
    }
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:54] INFO:     127.0.0.1:33180 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
-      "[2024-10-31 19:45:54] INFO:     127.0.0.1:33180 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Created batch job with ID: batch_c30756c3-8c09-4142-9630-9590d6124986</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Initial status: validating</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:54 TP0] Prefill batch. #new-seq: 135, #new-token: 1150, #cached-token: 6275, cache hit rate: 67.38%, token usage: 0.01, #running-req: 0, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:55 TP0] Prefill batch. #new-seq: 274, #new-token: 8192, #cached-token: 6850, cache hit rate: 55.74%, token usage: 0.02, #running-req: 135, #queue-req: 91\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:56 TP0] Prefill batch. #new-seq: 92, #new-token: 2758, #cached-token: 2302, cache hit rate: 54.19%, token usage: 0.06, #running-req: 408, #queue-req: 1\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:45:56 TP0] Decode batch. #running-req: 500, #token: 16025, token usage: 0.07, gen throughput (token/s): 409.21, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:46:00 TP0] Decode batch. #running-req: 500, #token: 36025, token usage: 0.17, gen throughput (token/s): 5777.09, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:46:03 TP0] Decode batch. #running-req: 500, #token: 56025, token usage: 0.26, gen throughput (token/s): 5530.76, #queue-req: 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:46:04] INFO:     127.0.0.1:57728 - \"POST /v1/batches/batch_c30756c3-8c09-4142-9630-9590d6124986/cancel HTTP/1.1\" 200 OK\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Cancellation initiated. Status: cancelling</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:46:07] INFO:     127.0.0.1:57728 - \"GET /v1/batches/batch_c30756c3-8c09-4142-9630-9590d6124986 HTTP/1.1\" 200 OK\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Current status: cancelled</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Batch job successfully cancelled</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-10-31 19:46:07] INFO:     127.0.0.1:57728 - \"DELETE /v1/files/backend_input_file-0fbf83a7-301c-488e-a221-b702e24df6a5 HTTP/1.1\" 200 OK\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Successfully cleaned up input file</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Successfully deleted local batch_requests.jsonl file</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "import json\n",
    "import time\n",