Improve OpenAI API documents (#1827)
Co-authored-by: Chayenne <zhaochenyang@g.ucla.edu>
@@ -6,7 +6,9 @@
"source": [
"# OpenAI Compatible API\n",
"\n",
"SGLang provides an OpenAI compatible API for smooth transition from OpenAI services.\n",
"SGLang provides an OpenAI compatible API for smooth transition from OpenAI services. Full reference of the API is available at [OpenAI API Reference](https://platform.openai.com/docs/api-reference).\n",
"\n",
"This tutorial covers these popular APIs:\n",
"\n",
"- `chat/completions`\n",
"- `completions`\n",
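For readers following along, here is a minimal client setup sketch for the APIs listed above. It assumes the server launched in the next cell is reachable on localhost:30000; the `api_key` value is a placeholder, since the server here is started without one and does not validate it.

from openai import OpenAI

# Minimal sketch: point the official OpenAI SDK at the local SGLang server.
# The api_key string is a placeholder (the server is launched with no key).
client = OpenAI(base_url="http://localhost:30000/v1", api_key="None")

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "List 3 countries and their capitals."}],
    temperature=0,
    max_tokens=64,
)
print(response.choices[0].message.content)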
@@ -27,42 +29,99 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Server is ready. Proceeding with the next steps.\n"
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
" warnings.warn(\n",
"[2024-10-28 02:02:31] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=800169736, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
" warnings.warn(\n",
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
" warnings.warn(\n",
"[2024-10-28 02:02:36 TP0] Init torch distributed begin.\n",
"[2024-10-28 02:02:37 TP0] Load weight begin. avail mem=47.27 GB\n",
"[2024-10-28 02:02:37 TP0] Ignore import error when loading sglang.srt.models.mllama. No module named 'transformers.models.mllama'\n",
"INFO 10-28 02:02:38 weight_utils.py:236] Using model weights format ['*.safetensors']\n",
"Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00<?, ?it/s]\n",
"Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:00<00:01, 2.57it/s]\n",
"Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:00<00:00, 2.45it/s]\n",
"Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:00<00:00, 3.53it/s]\n",
"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00, 2.98it/s]\n",
"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00, 2.94it/s]\n",
"\n",
"[2024-10-28 02:02:40 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=32.22 GB\n",
"[2024-10-28 02:02:40 TP0] Memory pool end. avail mem=4.60 GB\n",
"[2024-10-28 02:02:40 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
"[2024-10-28 02:02:48 TP0] max_total_num_tokens=217512, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
"[2024-10-28 02:02:48] INFO: Started server process [1185529]\n",
"[2024-10-28 02:02:48] INFO: Waiting for application startup.\n",
"[2024-10-28 02:02:48] INFO: Application startup complete.\n",
"[2024-10-28 02:02:48] INFO: Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
"[2024-10-28 02:02:48] INFO: 127.0.0.1:47904 - \"GET /v1/models HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Server is ready. Proceeding with the next steps.</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n",
"\n",
"server_process = execute_shell_command(\n",
" \"\"\"\n",
"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
"--port 30000 --host 0.0.0.0 --log-level warning\n",
"\"\"\"\n",
"from sglang.utils import (\n",
" execute_shell_command,\n",
" wait_for_server,\n",
" terminate_process,\n",
" print_highlight,\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30000\")\n",
"print(\"Server is ready. Proceeding with the next steps.\")"
"server_process = execute_shell_command(\n",
" command=\"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30000\")"
]
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ChatCompletion(id='e854540ec7914b2d8c712f16fd9ed2ca', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730012326, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))\n"
"[2024-10-28 02:02:49 TP0] Prefill batch. #new-seq: 1, #new-token: 49, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-28 02:02:49] INFO: 127.0.0.1:47912 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
"[2024-10-28 02:02:49 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 1, #queue-req: 0\n",
"[2024-10-28 02:02:49] INFO: 127.0.0.1:47926 - \"POST /generate HTTP/1.1\" 200 OK\n",
"[2024-10-28 02:02:49] The server is fired up and ready to roll!\n",
"[2024-10-28 02:02:50 TP0] Decode batch. #running-req: 1, #token: 89, token usage: 0.00, gen throughput (token/s): 24.12, #queue-req: 0\n",
"[2024-10-28 02:02:50] INFO: 127.0.0.1:47910 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Response: ChatCompletion(id='692899ebd3ea464dbb456008a7d60bf3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730106170, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
@@ -84,7 +143,8 @@
" temperature=0,\n",
" max_tokens=64,\n",
")\n",
"print(response)"
"\n",
"print_highlight(f\"Response: {response}\")"
]
},
{
@@ -93,40 +153,35 @@
"source": [
"### Parameters\n",
"\n",
"The chat completions API accepts the following parameters (refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details):\n",
"\n",
"- `messages`: List of messages in the conversation, each containing `role` and `content`\n",
"- `model`: The model identifier to use for completion\n",
"- `max_tokens`: Maximum number of tokens to generate in the response\n",
"- `temperature`: Controls randomness (0-2). Lower values make output more focused and deterministic\n",
"- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n",
"- `n`: Number of chat completion choices to generate\n",
"- `stream`: If true, partial message deltas will be sent as they become available\n",
"- `stop`: Sequences where the API will stop generating further tokens\n",
"- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n",
"- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n",
"- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n",
"- `logprobs`: Include log probabilities of tokens in the response\n",
"- `top_logprobs`: Number of most likely tokens to return probabilities for\n",
"- `seed`: Random seed for deterministic results\n",
"- `response_format`: Specify the format of the response (e.g., JSON)\n",
"- `stream_options`: Additional options for streaming responses\n",
"- `user`: A unique identifier representing your end-user\n",
"The chat completions API accepts OpenAI Chat Completions API's parameters. Refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details.\n",
"\n",
"Here is an example of a detailed chat completion request:"
]
},
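A sketch of such a detailed request, reusing the `client` from the setup sketch near the top; the sampling values are illustrative examples, not recommendations, and the notebook's own cell (shown next in the diff) is the authoritative version.

# Illustrative detailed chat completion request (example values only).
response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[
        {"role": "system", "content": "You are a knowledgeable historian."},
        {"role": "user", "content": "Tell me about ancient Rome"},
    ],
    temperature=0.3,
    max_tokens=128,
    top_p=0.95,
    presence_penalty=0.2,
    frequency_penalty=0.2,
    n=1,
    seed=42,
)
print(response.choices[0].message.content)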
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ancient Rome's major achievements include:"
"[2024-10-28 02:02:50 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-28 02:02:50] INFO: 127.0.0.1:47910 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Response: ChatCompletion(id='bffa083869484c78ab89d334514d5af3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"Ancient Rome's major achievements include:\", refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop='\\n\\n')], created=1730106170, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=8, prompt_tokens=76, total_tokens=84, prompt_tokens_details=None))</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
@@ -152,11 +207,9 @@
" frequency_penalty=0.2, # Mild penalty for more natural language\n",
" n=1, # Single response is usually more stable\n",
" seed=42, # Keep for reproducibility\n",
" stream=True, # Keep streaming for real-time output\n",
")\n",
"\n",
"for chunk in response:\n",
" print(chunk.choices[0].delta.content or \"\", end=\"\")"
"print_highlight(f\"Response: {response}\")"
]
},
{
@@ -167,20 +220,34 @@
"\n",
"### Usage\n",
"\n",
"Completions API is similar to Chat Completions API, but without the `messages` parameter. Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details."
"Completions API is similar to Chat Completions API, but without the `messages` parameter."
]
},
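A minimal text-completion sketch with the same `client` as above; the prompt mirrors the notebook's example, and the values are illustrative.

# Minimal sketch of a /v1/completions request (prompt instead of messages).
response = client.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    prompt="List 3 countries and their capitals.",
    temperature=0,
    max_tokens=64,
    n=1,
    stop=None,
)
print(response.choices[0].text)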
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Completion(id='a6e07198f4b445baa0fb08a2178ceb59', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730012328, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, prompt_tokens_details=None))\n"
"[2024-10-28 02:02:50 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 21.28%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-28 02:02:51 TP0] Decode batch. #running-req: 1, #token: 37, token usage: 0.00, gen throughput (token/s): 38.07, #queue-req: 0\n",
"[2024-10-28 02:02:52] INFO: 127.0.0.1:47910 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Response: Completion(id='eb486d0a32fd4384baba923f3bc17e8b', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730106172, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, prompt_tokens_details=None))</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
@@ -192,7 +259,8 @@
" n=1,\n",
" stop=None,\n",
")\n",
"print(response)"
"\n",
"print_highlight(f\"Response: {response}\")"
]
},
{
@@ -201,44 +269,39 @@
"source": [
"### Parameters\n",
"\n",
"The completions API accepts the following parameters:\n",
"\n",
"- `model`: The model identifier to use for completion\n",
"- `prompt`: Input text to generate completions for. Can be a string, array of strings, or token arrays\n",
"- `best_of`: Number of completions to generate server-side and return the best one\n",
"- `echo`: If true, the prompt will be included in the response\n",
"- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n",
"- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n",
"- `logprobs`: Include log probabilities of tokens in the response\n",
"- `max_tokens`: Maximum number of tokens to generate in the response (default: 16)\n",
"- `n`: Number of completion choices to generate\n",
"- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n",
"- `seed`: Random seed for deterministic results\n",
"- `stop`: Sequences where the API will stop generating further tokens\n",
"- `stream`: If true, partial completion deltas will be sent as they become available\n",
"- `stream_options`: Additional options for streaming responses\n",
"- `suffix`: Text to append to the completion\n",
"- `temperature`: Controls randomness (0-2). Lower values make output more focused and deterministic\n",
"- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n",
"- `user`: A unique identifier representing your end-user\n",
"The completions API accepts OpenAI Completions API's parameters. Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details.\n",
"\n",
"Here is an example of a detailed completions request:"
]
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Space explorer, Captain Orion Blackwood, had been traveling through the galaxy for 12 years, searching for a new home for humanity. His ship, the Aurora, had been his home for so long that he barely remembered what it was like to walk on solid ground.\n",
"As he navigated through the dense asteroid field, the ship's computer, S.A.R.A. (Self-Aware Reasoning Algorithm), alerted him to a strange reading on one of the asteroids. Captain Blackwood's curiosity was piqued, and he decided to investigate further.\n",
"\"Captain, I'm detecting unusual energy signatures emanating from the asteroid,\" S.A.R.A. said. \"It's unlike anything I've seen before.\"\n",
"Captain Blackwood's eyes narrowed as"
"[2024-10-28 02:02:52 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 20.53%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-28 02:02:52 TP0] Decode batch. #running-req: 1, #token: 15, token usage: 0.00, gen throughput (token/s): 40.91, #queue-req: 0\n",
"[2024-10-28 02:02:53 TP0] Decode batch. #running-req: 1, #token: 55, token usage: 0.00, gen throughput (token/s): 42.13, #queue-req: 0\n",
"[2024-10-28 02:02:54 TP0] Decode batch. #running-req: 1, #token: 95, token usage: 0.00, gen throughput (token/s): 42.10, #queue-req: 0\n",
"[2024-10-28 02:02:55 TP0] Decode batch. #running-req: 1, #token: 135, token usage: 0.00, gen throughput (token/s): 41.94, #queue-req: 0\n",
"[2024-10-28 02:02:55] INFO: 127.0.0.1:47910 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Response: Completion(id='fb23a12a15bc4137815b91d63b6fd976', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=\" Here is a short story about a space explorer named Astrid.\\nAstrid had always been fascinated by the stars. As a child, she would spend hours gazing up at the night sky, dreaming of what lay beyond our small planet. Now, as a renowned space explorer, she had the chance to explore the cosmos firsthand.\\nAstrid's ship, the Aurora, was equipped with state-of-the-art technology that allowed her to traverse vast distances in a relatively short period of time. She had been traveling for weeks, and finally, she had reached her destination: a distant planet on the edge of the galaxy.\\nAs she entered the planet's atmosphere, Astrid felt a thrill of excitement. She had never seen anything like this before.\", matched_stop=None)], created=1730106175, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=150, prompt_tokens=10, total_tokens=160, prompt_tokens_details=None))</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
@@ -253,11 +316,9 @@
" frequency_penalty=0.3, # Reduce repetitive phrases\n",
" n=1, # Generate one completion\n",
" seed=123, # For reproducible results\n",
" stream=True, # Stream the response\n",
")\n",
"\n",
"for chunk in response:\n",
" print(chunk.choices[0].text or \"\", end=\"\")"
"print_highlight(f\"Response: {response}\")"
]
},
{
@@ -279,15 +340,29 @@
},
{
"cell_type": "code",
"execution_count": 43,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Batch job created with ID: batch_03d7f74f-dffe-4c26-b5e7-bb9fb5cb89ff\n"
"[2024-10-28 02:02:55] INFO: 127.0.0.1:43330 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
"[2024-10-28 02:02:55] INFO: 127.0.0.1:43330 - \"POST /v1/batches HTTP/1.1\" 200 OK\n",
"[2024-10-28 02:02:55 TP0] Prefill batch. #new-seq: 2, #new-token: 30, #cached-token: 50, cache hit rate: 35.06%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Batch job created with ID: batch_56fefd2e-0187-4c14-aa2d-110917723dde</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
@@ -337,29 +412,91 @@
" completion_window=\"24h\",\n",
")\n",
"\n",
"print(f\"Batch job created with ID: {batch_response.id}\")"
"print_highlight(f\"Batch job created with ID: {batch_response.id}\")"
]
},
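The file uploaded before creating the batch is JSON Lines: one request object per line, following the OpenAI batch request schema (custom_id / method / url / body). A sketch of writing such a file (the file name is illustrative; the two requests mirror the outputs shown in this notebook's batch run):

import json

# Hypothetical batch input, one JSON object per line (JSONL).
requests = [
    {
        "custom_id": "request-1",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "messages": [{"role": "user", "content": "Tell me a joke about programming"}],
            "max_tokens": 50,
        },
    },
    {
        "custom_id": "request-2",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "messages": [{"role": "user", "content": "What is Python?"}],
            "max_tokens": 50,
        },
    },
]

# Write the JSONL file, then upload it via client.files.create(..., purpose="batch").
with open("batch_requests.jsonl", "w") as f:
    for req in requests:
        f.write(json.dumps(req) + "\n")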
{
"cell_type": "code",
"execution_count": 44,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:02:56 TP0] Decode batch. #running-req: 2, #token: 82, token usage: 0.00, gen throughput (token/s): 55.10, #queue-req: 0\n",
"Batch job status: validating...trying again in 3 seconds...\n",
"[2024-10-28 02:02:58] INFO: 127.0.0.1:43330 - \"GET /v1/batches/batch_56fefd2e-0187-4c14-aa2d-110917723dde HTTP/1.1\" 200 OK\n",
"Batch job completed successfully!\n",
"Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n",
"\n",
"Request request-1:\n",
"Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730012333, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}\n",
"\n",
"Request request-2:\n",
"Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730012333, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes, including:\\n\\n* **Web Development**: Building web applications, web services, and web scraping.\\n* **Data Science**: Data analysis'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}\n",
"\n",
"Cleaning up files...\n"
"[2024-10-28 02:02:58] INFO: 127.0.0.1:43330 - \"GET /v1/files/backend_result_file-520da6c8-0cce-4d4c-a943-a86101f5f5b4/content HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Request request-1:</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730106176, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'A programmer walks into a library and asks the librarian, \"Do you have any books on Pavlov\\'s dogs and Schrödinger\\'s cat?\"\\n\\nThe librarian replies, \"It rings a bell, but I\\'m not sure if it\\'s here'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 50, 'total_tokens': 91}, 'system_fingerprint': None}}</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Request request-2:</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730106176, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes, including:\\n\\n1. **Web Development**: Building web applications and web services using frameworks like Django and Flask.\\n2. **Data Analysis and'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Cleaning up files...</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:02:58] INFO: 127.0.0.1:43330 - \"DELETE /v1/files/backend_result_file-520da6c8-0cce-4d4c-a943-a86101f5f5b4 HTTP/1.1\" 200 OK\n"
]
}
],
@@ -382,16 +519,16 @@
" ]\n",
"\n",
" for result in results:\n",
" print(f\"\\nRequest {result['custom_id']}:\")\n",
" print(f\"Response: {result['response']}\")\n",
" print_highlight(f\"Request {result['custom_id']}:\")\n",
" print_highlight(f\"Response: {result['response']}\")\n",
"\n",
" print(\"\\nCleaning up files...\")\n",
" print_highlight(\"Cleaning up files...\")\n",
" # Only delete the result file ID since file_response is just content\n",
" client.files.delete(result_file_id)\n",
"else:\n",
" print(f\"Batch job failed with status: {batch_response.status}\")\n",
" print_highlight(f\"Batch job failed with status: {batch_response.status}\")\n",
" if hasattr(batch_response, \"errors\"):\n",
" print(f\"Errors: {batch_response.errors}\")"
" print_highlight(f\"Errors: {batch_response.errors}\")"
]
},
{
@@ -408,66 +545,210 @@
},
{
"cell_type": "code",
"execution_count": 45,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Created batch job with ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
"Initial status: validating\n",
"Batch job details (check 1/5):\n",
"ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
"Status: in_progress\n",
"Created at: 1730012334\n",
"Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
"Output file ID: None\n",
"Request counts:\n",
"Total: 0\n",
"Completed: 0\n",
"Failed: 0\n",
"Batch job details (check 2/5):\n",
"ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
"Status: in_progress\n",
"Created at: 1730012334\n",
"Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
"Output file ID: None\n",
"Request counts:\n",
"Total: 0\n",
"Completed: 0\n",
"Failed: 0\n",
"Batch job details (check 3/5):\n",
"ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
"Status: in_progress\n",
"Created at: 1730012334\n",
"Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
"Output file ID: None\n",
"Request counts:\n",
"Total: 0\n",
"Completed: 0\n",
"Failed: 0\n",
"Batch job details (check 4/5):\n",
"ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
"Status: completed\n",
"Created at: 1730012334\n",
"Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
"Output file ID: backend_result_file-d32f441d-e737-4da3-b07a-c39349425b3a\n",
"Request counts:\n",
"Total: 100\n",
"Completed: 100\n",
"Failed: 0\n",
"Batch job details (check 5/5):\n",
"ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
"Status: completed\n",
"Created at: 1730012334\n",
"Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
"Output file ID: backend_result_file-d32f441d-e737-4da3-b07a-c39349425b3a\n",
"Request counts:\n",
"Total: 100\n",
"Completed: 100\n",
"Failed: 0\n"
"[2024-10-28 02:02:58] INFO: 127.0.0.1:43336 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
|
||||
"[2024-10-28 02:02:58] INFO: 127.0.0.1:43336 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<strong style='color: #00008B;'>Created batch job with ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5</strong>"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.HTML object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<strong style='color: #00008B;'>Initial status: validating</strong>"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.HTML object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[2024-10-28 02:02:58 TP0] Prefill batch. #new-seq: 17, #new-token: 510, #cached-token: 425, cache hit rate: 43.40%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
|
||||
"[2024-10-28 02:02:58 TP0] Prefill batch. #new-seq: 83, #new-token: 2490, #cached-token: 2075, cache hit rate: 45.04%, token usage: 0.00, #running-req: 17, #queue-req: 0\n",
|
||||
"[2024-10-28 02:02:59 TP0] Decode batch. #running-req: 100, #token: 3725, token usage: 0.02, gen throughput (token/s): 234.43, #queue-req: 0\n",
|
||||
"[2024-10-28 02:03:00 TP0] Decode batch. #running-req: 100, #token: 7725, token usage: 0.04, gen throughput (token/s): 3545.41, #queue-req: 0\n",
|
||||
"[2024-10-28 02:03:01 TP0] Decode batch. #running-req: 100, #token: 11725, token usage: 0.05, gen throughput (token/s): 3448.10, #queue-req: 0\n",
|
||||
"[2024-10-28 02:03:02 TP0] Decode batch. #running-req: 100, #token: 15725, token usage: 0.07, gen throughput (token/s): 3362.62, #queue-req: 0\n",
|
||||
"[2024-10-28 02:03:04 TP0] Decode batch. #running-req: 100, #token: 19725, token usage: 0.09, gen throughput (token/s): 3279.58, #queue-req: 0\n",
|
||||
"[2024-10-28 02:03:05 TP0] Decode batch. #running-req: 100, #token: 23725, token usage: 0.11, gen throughput (token/s): 3200.86, #queue-req: 0\n",
|
||||
"[2024-10-28 02:03:06 TP0] Decode batch. #running-req: 100, #token: 27725, token usage: 0.13, gen throughput (token/s): 3126.52, #queue-req: 0\n",
|
||||
"[2024-10-28 02:03:07 TP0] Decode batch. #running-req: 100, #token: 31725, token usage: 0.15, gen throughput (token/s): 3053.16, #queue-req: 0\n",
|
||||
"[2024-10-28 02:03:08] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<strong style='color: #00008B;'>Batch job details (check 1 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'><strong>Request counts: Total: 0 // Completed: 0 // Failed: 0</strong></strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:09 TP0] Decode batch. #running-req: 100, #token: 35725, token usage: 0.16, gen throughput (token/s): 2980.26, #queue-req: 0\n",
"[2024-10-28 02:03:10 TP0] Decode batch. #running-req: 100, #token: 39725, token usage: 0.18, gen throughput (token/s): 2919.09, #queue-req: 0\n",
"[2024-10-28 02:03:11] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Batch job details (check 2 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'><strong>Request counts: Total: 0 // Completed: 0 // Failed: 0</strong></strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:11 TP0] Decode batch. #running-req: 100, #token: 43725, token usage: 0.20, gen throughput (token/s): 2854.92, #queue-req: 0\n",
"[2024-10-28 02:03:13 TP0] Decode batch. #running-req: 100, #token: 47725, token usage: 0.22, gen throughput (token/s): 2794.62, #queue-req: 0\n",
"[2024-10-28 02:03:14] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Batch job details (check 3 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'><strong>Request counts: Total: 0 // Completed: 0 // Failed: 0</strong></strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:14 TP0] Decode batch. #running-req: 100, #token: 51725, token usage: 0.24, gen throughput (token/s): 2737.84, #queue-req: 0\n",
"[2024-10-28 02:03:17] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Batch job details (check 4 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: completed // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: backend_result_file-c10ee9f5-eca8-4357-a922-934543b7f433</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'><strong>Request counts: Total: 100 // Completed: 100 // Failed: 0</strong></strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:20] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Batch job details (check 5 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: completed // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: backend_result_file-c10ee9f5-eca8-4357-a922-934543b7f433</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'><strong>Request counts: Total: 100 // Completed: 100 // Failed: 0</strong></strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
@@ -515,25 +796,21 @@
" completion_window=\"24h\",\n",
")\n",
"\n",
"print(f\"Created batch job with ID: {batch_job.id}\")\n",
"print(f\"Initial status: {batch_job.status}\")\n",
"print_highlight(f\"Created batch job with ID: {batch_job.id}\")\n",
"print_highlight(f\"Initial status: {batch_job.status}\")\n",
"\n",
"time.sleep(10)\n",
"\n",
"max_checks = 5\n",
"for i in range(max_checks):\n",
" batch_details = client.batches.retrieve(batch_id=batch_job.id)\n",
" print(f\"Batch job details (check {i+1}/{max_checks}):\")\n",
" print(f\"ID: {batch_details.id}\")\n",
" print(f\"Status: {batch_details.status}\")\n",
" print(f\"Created at: {batch_details.created_at}\")\n",
" print(f\"Input file ID: {batch_details.input_file_id}\")\n",
" print(f\"Output file ID: {batch_details.output_file_id}\")\n",
"\n",
" print(\"Request counts:\")\n",
" print(f\"Total: {batch_details.request_counts.total}\")\n",
" print(f\"Completed: {batch_details.request_counts.completed}\")\n",
" print(f\"Failed: {batch_details.request_counts.failed}\")\n",
" print_highlight(\n",
" f\"Batch job details (check {i+1} / {max_checks}) // ID: {batch_details.id} // Status: {batch_details.status} // Created at: {batch_details.created_at} // Input file ID: {batch_details.input_file_id} // Output file ID: {batch_details.output_file_id}\"\n",
" )\n",
" print_highlight(\n",
" f\"<strong>Request counts: Total: {batch_details.request_counts.total} // Completed: {batch_details.request_counts.completed} // Failed: {batch_details.request_counts.failed}</strong>\"\n",
" )\n",
"\n",
" time.sleep(3)"
]
@@ -547,20 +824,114 @@
},
{
"cell_type": "code",
"execution_count": 46,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Created batch job with ID: batch_3d2dd881-ad84-465a-85ee-6d5991794e5e\n",
"Initial status: validating\n",
"Cancellation initiated. Status: cancelling\n",
"Current status: cancelled\n",
"Batch job successfully cancelled\n",
"Successfully cleaned up input file\n"
"[2024-10-28 02:03:23] INFO: 127.0.0.1:47360 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
"[2024-10-28 02:03:23] INFO: 127.0.0.1:47360 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Created batch job with ID: batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Initial status: validating</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:23 TP0] Prefill batch. #new-seq: 44, #new-token: 44, #cached-token: 2376, cache hit rate: 60.81%, token usage: 0.01, #running-req: 0, #queue-req: 0\n",
"[2024-10-28 02:03:23 TP0] Prefill batch. #new-seq: 328, #new-token: 8192, #cached-token: 9824, cache hit rate: 56.49%, token usage: 0.01, #running-req: 44, #queue-req: 128\n",
"[2024-10-28 02:03:24 TP0] Prefill batch. #new-seq: 129, #new-token: 3864, #cached-token: 3231, cache hit rate: 54.15%, token usage: 0.05, #running-req: 371, #queue-req: 1\n",
"[2024-10-28 02:03:27 TP0] Decode batch. #running-req: 500, #token: 29025, token usage: 0.13, gen throughput (token/s): 1162.55, #queue-req: 0\n",
"[2024-10-28 02:03:31 TP0] Decode batch. #running-req: 500, #token: 49025, token usage: 0.23, gen throughput (token/s): 5606.35, #queue-req: 0\n",
"[2024-10-28 02:03:33] INFO: 127.0.0.1:40110 - \"POST /v1/batches/batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62/cancel HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Cancellation initiated. Status: cancelling</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:36] INFO: 127.0.0.1:40110 - \"GET /v1/batches/batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Current status: cancelled</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Batch job successfully cancelled</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:36] INFO: 127.0.0.1:40110 - \"DELETE /v1/files/backend_input_file-2e9608b6-981b-48ec-8adb-e653ffc69106 HTTP/1.1\" 200 OK\n"
]
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Successfully cleaned up input file</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
@@ -608,45 +979,57 @@
" completion_window=\"24h\",\n",
")\n",
"\n",
"print(f\"Created batch job with ID: {batch_job.id}\")\n",
"print(f\"Initial status: {batch_job.status}\")\n",
"print_highlight(f\"Created batch job with ID: {batch_job.id}\")\n",
"print_highlight(f\"Initial status: {batch_job.status}\")\n",
"\n",
"time.sleep(10)\n",
"\n",
"try:\n",
" cancelled_job = client.batches.cancel(batch_id=batch_job.id)\n",
" print(f\"Cancellation initiated. Status: {cancelled_job.status}\")\n",
" print_highlight(f\"Cancellation initiated. Status: {cancelled_job.status}\")\n",
" assert cancelled_job.status == \"cancelling\"\n",
"\n",
" # Monitor the cancellation process\n",
" while cancelled_job.status not in [\"failed\", \"cancelled\"]:\n",
" time.sleep(3)\n",
" cancelled_job = client.batches.retrieve(batch_job.id)\n",
" print(f\"Current status: {cancelled_job.status}\")\n",
|
||||
" print_highlight(f\"Current status: {cancelled_job.status}\")\n",
|
||||
"\n",
|
||||
" # Verify final status\n",
|
||||
" assert cancelled_job.status == \"cancelled\"\n",
|
||||
" print(\"Batch job successfully cancelled\")\n",
|
||||
" print_highlight(\"Batch job successfully cancelled\")\n",
|
||||
"\n",
|
||||
"except Exception as e:\n",
|
||||
" print(f\"Error during cancellation: {e}\")\n",
|
||||
" print_highlight(f\"Error during cancellation: {e}\")\n",
|
||||
" raise e\n",
|
||||
"\n",
|
||||
"finally:\n",
|
||||
" try:\n",
|
||||
" del_response = client.files.delete(uploaded_file.id)\n",
|
||||
" if del_response.deleted:\n",
|
||||
" print(\"Successfully cleaned up input file\")\n",
|
||||
" print_highlight(\"Successfully cleaned up input file\")\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Error cleaning up: {e}\")\n",
|
||||
" print_highlight(f\"Error cleaning up: {e}\")\n",
|
||||
" raise e"
|
||||
]
|
||||
},
{
"cell_type": "code",
"execution_count": 47,
"execution_count": 10,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:36] INFO: Shutting down\n",
"[2024-10-28 02:03:36] INFO: Waiting for application shutdown.\n",
"[2024-10-28 02:03:36] INFO: Application shutdown complete.\n",
"[2024-10-28 02:03:36] INFO: Finished server process [1185529]\n",
"W1028 02:03:37.084000 140231994889792 torch/_inductor/compile_worker/subproc_pool.py:126] SubprocPool unclean exit\n"
]
}
],
"source": [
"terminate_process(server_process)"
]