diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml
index d7177d2f6..aeb9765f1 100644
--- a/.github/workflows/deploy-docs.yml
+++ b/.github/workflows/deploy-docs.yml
@@ -38,14 +38,8 @@ jobs:
GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }}
run: |
cd docs
- for nb in *.ipynb; do
- if [ -f "$nb" ]; then
- echo "Executing $nb"
- jupyter nbconvert --to notebook --execute --inplace "$nb" \
- --ExecutePreprocessor.timeout=600 \
- --ExecutePreprocessor.kernel_name=python3
- fi
- done
+ make clean
+ make compile
make html
cd _build/html
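Note on the hunk above: the inline nbconvert loop is replaced by the docs Makefile's new `compile` target (defined in the Makefile hunk below), so both workflows share a single notebook-execution path. For local spot-checks, a minimal sketch of the equivalent one-off command for a single notebook, with the notebook path purely illustrative:

    jupyter nbconvert --to notebook --execute --inplace docs/backend/openai_api.ipynb \
      --ExecutePreprocessor.timeout=600 \
      --ExecutePreprocessor.kernel_name=python3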
diff --git a/.github/workflows/execute-notebook.yml b/.github/workflows/execute-notebook.yml
index ebc73bac1..b0a2a8324 100644
--- a/.github/workflows/execute-notebook.yml
+++ b/.github/workflows/execute-notebook.yml
@@ -44,11 +44,5 @@ jobs:
- name: Execute notebooks
run: |
cd docs
- for nb in *.ipynb; do
- if [ -f "$nb" ]; then
- echo "Executing $nb"
- jupyter nbconvert --to notebook --execute --inplace "$nb" \
- --ExecutePreprocessor.timeout=600 \
- --ExecutePreprocessor.kernel_name=python3
- fi
- done
\ No newline at end of file
+ make clean
+ make compile
\ No newline at end of file
diff --git a/README.md b/README.md
index 6dc577ee4..8e5d55f50 100644
--- a/README.md
+++ b/README.md
@@ -40,13 +40,13 @@ The core features include:
- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
## Install
-See [https://sgl-project.github.io/install.html](https://sgl-project.github.io/install.html)
+See [https://sgl-project.github.io/starts/install.html](https://sgl-project.github.io/starts/install.html)
## Backend: SGLang Runtime (SRT)
-See [https://sgl-project.github.io/backend.html](https://sgl-project.github.io/backend.html)
+See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
## Frontend: Structured Generation Language (SGLang)
-See [https://sgl-project.github.io/frontend.html](https://sgl-project.github.io/frontend.html)
+See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)
## Benchmark And Performance
Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
diff --git a/docs/Makefile b/docs/Makefile
index 3e4a1b5e5..b439c4fe2 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -12,7 +12,18 @@ BUILDDIR = _build
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-.PHONY: help Makefile
+# New target to execute all Jupyter Notebook files in place
+compile:
+ find $(SOURCEDIR) -name '*.ipynb' | while read nb; do \
+ if [ -f "$$nb" ]; then \
+ echo "Executing $$nb"; \
+ jupyter nbconvert --to notebook --execute --inplace "$$nb" \
+ --ExecutePreprocessor.timeout=600 \
+ --ExecutePreprocessor.kernel_name=python3; \
+ fi; \
+ done
+
+.PHONY: help Makefile compile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
diff --git a/docs/backend.md b/docs/backend/backend.md
similarity index 100%
rename from docs/backend.md
rename to docs/backend/backend.md
diff --git a/docs/embedding_model.ipynb b/docs/backend/embedding_model.ipynb
similarity index 59%
rename from docs/embedding_model.ipynb
rename to docs/backend/embedding_model.ipynb
index c939204c5..af985a87e 100644
--- a/docs/embedding_model.ipynb
+++ b/docs/backend/embedding_model.ipynb
@@ -30,47 +30,181 @@
{
"cell_type": "code",
"execution_count": 1,
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:47:32.337369Z",
+ "iopub.status.busy": "2024-11-01T02:47:32.337032Z",
+ "iopub.status.idle": "2024-11-01T02:47:59.540926Z",
+ "shell.execute_reply": "2024-11-01T02:47:59.539861Z"
+ }
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
- " warnings.warn(\n",
- "[2024-10-29 21:07:15] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-7B-instruct', chat_template=None, is_embedding=True, host='0.0.0.0', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=568040040, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:47:37] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-7B-instruct', chat_template=None, is_embedding=True, host='0.0.0.0', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=314021918, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
" warnings.warn(\n",
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
- " warnings.warn(\n",
- "[2024-10-29 21:07:20 TP0] Init torch distributed begin.\n",
- "[2024-10-29 21:07:20 TP0] Load weight begin. avail mem=47.27 GB\n",
- "[2024-10-29 21:07:21 TP0] lm_eval is not installed, GPTQ may not be usable\n",
- "INFO 10-29 21:07:22 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
- "Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00, ?it/s]\n",
- "Loading safetensors checkpoint shards: 14% Completed | 1/7 [00:00<00:03, 1.65it/s]\n",
- "Loading safetensors checkpoint shards: 29% Completed | 2/7 [00:01<00:04, 1.02it/s]\n",
- "Loading safetensors checkpoint shards: 43% Completed | 3/7 [00:03<00:04, 1.24s/it]\n",
- "Loading safetensors checkpoint shards: 57% Completed | 4/7 [00:05<00:04, 1.47s/it]\n",
- "Loading safetensors checkpoint shards: 71% Completed | 5/7 [00:07<00:03, 1.62s/it]\n",
- "Loading safetensors checkpoint shards: 86% Completed | 6/7 [00:08<00:01, 1.64s/it]\n",
- "Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:10<00:00, 1.63s/it]\n",
- "Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:10<00:00, 1.49s/it]\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:47:43 TP0] Init torch distributed begin.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:47:44 TP0] Load weight begin. avail mem=47.27 GB\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:47:44 TP0] lm_eval is not installed, GPTQ may not be usable\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "INFO 10-31 19:47:45 weight_utils.py:243] Using model weights format ['*.safetensors']\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00, ?it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Loading safetensors checkpoint shards: 14% Completed | 1/7 [00:00<00:03, 1.96it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Loading safetensors checkpoint shards: 29% Completed | 2/7 [00:01<00:03, 1.39it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Loading safetensors checkpoint shards: 43% Completed | 3/7 [00:02<00:03, 1.13it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Loading safetensors checkpoint shards: 57% Completed | 4/7 [00:03<00:02, 1.00it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Loading safetensors checkpoint shards: 71% Completed | 5/7 [00:04<00:02, 1.05s/it]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Loading safetensors checkpoint shards: 86% Completed | 6/7 [00:05<00:01, 1.09s/it]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:07<00:00, 1.11s/it]\n",
+ "\r",
+ "Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:07<00:00, 1.01s/it]\n",
"\n",
- "[2024-10-29 21:07:32 TP0] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=32.91 GB\n",
- "[2024-10-29 21:07:33 TP0] Memory pool end. avail mem=4.56 GB\n",
- "[2024-10-29 21:07:33 TP0] max_total_num_tokens=509971, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
- "[2024-10-29 21:07:33] INFO: Started server process [2650986]\n",
- "[2024-10-29 21:07:33] INFO: Waiting for application startup.\n",
- "[2024-10-29 21:07:33] INFO: Application startup complete.\n",
- "[2024-10-29 21:07:33] INFO: Uvicorn running on http://0.0.0.0:30010 (Press CTRL+C to quit)\n",
- "[2024-10-29 21:07:34] INFO: 127.0.0.1:47812 - \"GET /v1/models HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:47:53 TP0] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=32.91 GB\n",
+ "[2024-10-31 19:47:53 TP0] Memory pool end. avail mem=4.56 GB\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:47:53 TP0] max_total_num_tokens=509971, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:47:54] INFO: Started server process [1552642]\n",
+ "[2024-10-31 19:47:54] INFO: Waiting for application startup.\n",
+ "[2024-10-31 19:47:54] INFO: Application startup complete.\n",
+ "[2024-10-31 19:47:54] INFO: Uvicorn running on http://0.0.0.0:30010 (Press CTRL+C to quit)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:47:54] INFO: 127.0.0.1:47776 - \"GET /v1/models HTTP/1.1\" 200 OK\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:47:55] INFO: 127.0.0.1:50344 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
+ "[2024-10-31 19:47:55 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:47:55] INFO: 127.0.0.1:50352 - \"POST /encode HTTP/1.1\" 200 OK\n",
+ "[2024-10-31 19:47:55] The server is fired up and ready to roll!\n"
]
},
{
"data": {
"text/html": [
- "
This cell combines server and notebook output.
Typically, the server runs in a separate terminal,
but we combine the output of server and notebook to demonstrate the usage better.
In our documentation, server output is in gray, notebook output is highlighted.
"
+ "
NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
"
],
"text/plain": [
""
@@ -78,16 +212,6 @@
},
"metadata": {},
"output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[2024-10-29 21:07:34] INFO: 127.0.0.1:41780 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
- "[2024-10-29 21:07:34 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
- "[2024-10-29 21:07:35] INFO: 127.0.0.1:41792 - \"POST /encode HTTP/1.1\" 200 OK\n",
- "[2024-10-29 21:07:35] The server is fired up and ready to roll!\n"
- ]
}
],
"source": [
@@ -118,20 +242,21 @@
{
"cell_type": "code",
"execution_count": 2,
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:47:59.543958Z",
+ "iopub.status.busy": "2024-11-01T02:47:59.543670Z",
+ "iopub.status.idle": "2024-11-01T02:47:59.591699Z",
+ "shell.execute_reply": "2024-11-01T02:47:59.590809Z"
+ }
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-28 02:10:30 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[2024-10-28 02:10:31] INFO: 127.0.0.1:48094 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:47:59 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+ "[2024-10-31 19:47:59] INFO: 127.0.0.1:50358 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
]
},
{
@@ -174,18 +299,21 @@
{
"cell_type": "code",
"execution_count": 3,
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:47:59.594229Z",
+ "iopub.status.busy": "2024-11-01T02:47:59.594049Z",
+ "iopub.status.idle": "2024-11-01T02:48:00.006233Z",
+ "shell.execute_reply": "2024-11-01T02:48:00.005255Z"
+ }
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-28 02:10:31] INFO: 127.0.0.1:48110 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
- "[2024-10-28 02:10:31 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
- "[2024-10-28 02:10:31] INFO: 127.0.0.1:48114 - \"POST /encode HTTP/1.1\" 200 OK\n",
- "[2024-10-28 02:10:31] The server is fired up and ready to roll!\n",
- "[2024-10-28 02:10:31 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 21.43%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
- "[2024-10-28 02:10:31] INFO: 127.0.0.1:48118 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:47:59 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 21.43%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+ "[2024-10-31 19:47:59] INFO: 127.0.0.1:50362 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
]
},
{
@@ -228,13 +356,20 @@
{
"cell_type": "code",
"execution_count": 4,
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:48:00.008858Z",
+ "iopub.status.busy": "2024-11-01T02:48:00.008689Z",
+ "iopub.status.idle": "2024-11-01T02:48:01.872542Z",
+ "shell.execute_reply": "2024-11-01T02:48:01.871573Z"
+ }
+ },
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+ "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
" warnings.warn(\n"
]
},
@@ -242,8 +377,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-28 02:10:32 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 33.33%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
- "[2024-10-28 02:10:32] INFO: 127.0.0.1:48124 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:48:01 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 33.33%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+ "[2024-10-31 19:48:01] INFO: 127.0.0.1:50366 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
]
},
{
@@ -284,20 +419,15 @@
{
"cell_type": "code",
"execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[2024-10-28 02:10:32] INFO: Shutting down\n",
- "[2024-10-28 02:10:32] INFO: Waiting for application shutdown.\n",
- "[2024-10-28 02:10:32] INFO: Application shutdown complete.\n",
- "[2024-10-28 02:10:32] INFO: Finished server process [1188896]\n",
- "W1028 02:10:32.490000 140389363193408 torch/_inductor/compile_worker/subproc_pool.py:126] SubprocPool unclean exit\n"
- ]
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:48:01.875204Z",
+ "iopub.status.busy": "2024-11-01T02:48:01.874915Z",
+ "iopub.status.idle": "2024-11-01T02:48:02.193734Z",
+ "shell.execute_reply": "2024-11-01T02:48:02.192158Z"
}
- ],
+ },
+ "outputs": [],
"source": [
"terminate_process(embedding_process)"
]
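The refreshed outputs above show the embedding server answering POST /v1/embeddings on port 30010. As a rough illustration of the request those log lines correspond to (the notebook's source cells are not part of this hunk, and the input text here is an assumption), a minimal curl sketch against the OpenAI-compatible endpoint:

    curl -s http://localhost:30010/v1/embeddings \
      -H "Content-Type: application/json" \
      -d '{"model": "Alibaba-NLP/gte-Qwen2-7B-instruct", "input": "Once upon a time"}'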
diff --git a/docs/openai_api.ipynb b/docs/backend/openai_api.ipynb
similarity index 55%
rename from docs/openai_api.ipynb
rename to docs/backend/openai_api.ipynb
index 374fa3056..9b9ba7ab0 100644
--- a/docs/openai_api.ipynb
+++ b/docs/backend/openai_api.ipynb
@@ -30,41 +30,140 @@
{
"cell_type": "code",
"execution_count": 1,
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:44:46.419815Z",
+ "iopub.status.busy": "2024-11-01T02:44:46.419509Z",
+ "iopub.status.idle": "2024-11-01T02:45:16.621648Z",
+ "shell.execute_reply": "2024-11-01T02:45:16.620659Z"
+ }
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "2024-10-30 09:44:20.477109: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
- "2024-10-30 09:44:20.489679: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
- "2024-10-30 09:44:20.489712: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
- "2024-10-30 09:44:21.010067: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
- "[2024-10-30 09:44:29] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=134920821, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
- "[2024-10-30 09:44:39 TP0] Init torch distributed begin.\n",
- "[2024-10-30 09:44:41 TP0] Load weight begin. avail mem=76.83 GB\n",
- "[2024-10-30 09:44:42 TP0] lm_eval is not installed, GPTQ may not be usable\n",
- "INFO 10-30 09:44:42 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
- "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00, ?it/s]\n",
- "Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:01<00:05, 1.77s/it]\n",
- "Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:03<00:03, 1.77s/it]\n",
- "Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:05<00:01, 1.77s/it]\n",
- "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00, 1.27s/it]\n",
- "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00, 1.45s/it]\n",
+ "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:44:51] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=357249111, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+ " warnings.warn(\n",
+ "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:44:57 TP0] Init torch distributed begin.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:44:58 TP0] Load weight begin. avail mem=47.27 GB\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:44:59 TP0] lm_eval is not installed, GPTQ may not be usable\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "INFO 10-31 19:44:59 weight_utils.py:243] Using model weights format ['*.safetensors']\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00, ?it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:00<00:01, 2.26it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:00<00:00, 2.25it/s]\n",
+ "\r",
+ "Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:01<00:00, 3.24it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00, 2.70it/s]\n",
+ "\r",
+ "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00, 2.67it/s]\n",
"\n",
- "[2024-10-30 09:44:48 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=61.82 GB\n",
- "[2024-10-30 09:44:48 TP0] Memory pool end. avail mem=8.19 GB\n",
- "[2024-10-30 09:44:49 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
- "[2024-10-30 09:44:58 TP0] max_total_num_tokens=430915, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
- "[2024-10-30 09:44:58] INFO: Started server process [231459]\n",
- "[2024-10-30 09:44:58] INFO: Waiting for application startup.\n",
- "[2024-10-30 09:44:58] INFO: Application startup complete.\n",
- "[2024-10-30 09:44:58] INFO: Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
- "[2024-10-30 09:44:59] INFO: 127.0.0.1:54650 - \"GET /v1/models HTTP/1.1\" 200 OK\n",
- "[2024-10-30 09:44:59] INFO: 127.0.0.1:54666 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
- "[2024-10-30 09:44:59 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
- "[2024-10-30 09:44:59] INFO: 127.0.0.1:54672 - \"POST /generate HTTP/1.1\" 200 OK\n",
- "[2024-10-30 09:44:59] The server is fired up and ready to roll!\n"
+ "[2024-10-31 19:45:01 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=32.22 GB\n",
+ "[2024-10-31 19:45:02 TP0] Memory pool end. avail mem=4.60 GB\n",
+ "[2024-10-31 19:45:02 TP0] Capture cuda graph begin. This can take up to several minutes.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:10 TP0] max_total_num_tokens=217512, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:10] INFO: Started server process [1543025]\n",
+ "[2024-10-31 19:45:10] INFO: Waiting for application startup.\n",
+ "[2024-10-31 19:45:10] INFO: Application startup complete.\n",
+ "[2024-10-31 19:45:10] INFO: Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:11] INFO: 127.0.0.1:35048 - \"GET /v1/models HTTP/1.1\" 200 OK\n",
+ "[2024-10-31 19:45:11] INFO: 127.0.0.1:35056 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
+ "[2024-10-31 19:45:11 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:11] INFO: 127.0.0.1:35066 - \"POST /generate HTTP/1.1\" 200 OK\n",
+ "[2024-10-31 19:45:11] The server is fired up and ready to roll!\n"
]
},
{
@@ -98,21 +197,40 @@
{
"cell_type": "code",
"execution_count": 2,
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:45:16.624550Z",
+ "iopub.status.busy": "2024-11-01T02:45:16.624258Z",
+ "iopub.status.idle": "2024-11-01T02:45:18.087455Z",
+ "shell.execute_reply": "2024-11-01T02:45:18.086450Z"
+ }
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-30 09:45:52 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
- "[2024-10-30 09:45:53 TP0] Decode batch. #running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 0.73, #queue-req: 0\n",
- "[2024-10-30 09:45:53] INFO: 127.0.0.1:55594 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:45:16 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:17 TP0] Decode batch. #running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 5.21, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:18] INFO: 127.0.0.1:37738 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
- "Response: ChatCompletion(id='876500c402ae452ea17e4dde415c108a', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730281553, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))"
+ "Response: ChatCompletion(id='e04fce6c460d4764af68007fc82763e1', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730429118, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))"
],
"text/plain": [
""
@@ -154,23 +272,54 @@
{
"cell_type": "code",
"execution_count": 3,
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:45:18.090228Z",
+ "iopub.status.busy": "2024-11-01T02:45:18.090071Z",
+ "iopub.status.idle": "2024-11-01T02:45:21.193221Z",
+ "shell.execute_reply": "2024-11-01T02:45:21.192539Z"
+ }
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-30 09:45:57 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
- "[2024-10-30 09:45:57 TP0] Decode batch. #running-req: 1, #token: 104, token usage: 0.00, gen throughput (token/s): 8.70, #queue-req: 0\n",
- "[2024-10-30 09:45:58 TP0] Decode batch. #running-req: 1, #token: 144, token usage: 0.00, gen throughput (token/s): 132.75, #queue-req: 0\n",
- "[2024-10-30 09:45:58 TP0] Decode batch. #running-req: 1, #token: 184, token usage: 0.00, gen throughput (token/s): 132.30, #queue-req: 0\n",
- "[2024-10-30 09:45:58] INFO: 127.0.0.1:55594 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:45:18 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:18 TP0] Decode batch. #running-req: 1, #token: 104, token usage: 0.00, gen throughput (token/s): 39.15, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:19 TP0] Decode batch. #running-req: 1, #token: 144, token usage: 0.00, gen throughput (token/s): 41.80, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:20 TP0] Decode batch. #running-req: 1, #token: 184, token usage: 0.00, gen throughput (token/s): 41.81, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:21] INFO: 127.0.0.1:37738 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
- "Ancient Rome's major achievements include:
1. **Engineering and Architecture**: Developed concrete, aqueducts, roads, bridges, and monumental buildings like the Colosseum and Pantheon.
2. **Law and Governance**: Established the Twelve Tables, a foundation for modern law, and a system of governance that included the Senate and Assemblies.
3. **Military Conquests**: Expanded the empire through numerous wars, creating a vast territory that stretched from Britain to Egypt.
4. **Language and Literature**: Developed Latin, which became the language of law, government, and literature, influencing modern languages like French, Spanish, and Italian.
"
+ "Ancient Rome's major achievements include:
1. **Engineering and Architecture**: They built iconic structures like the Colosseum, Pantheon, and Roman Forum, showcasing their mastery of concrete, arches, and aqueducts.
2. **Law and Governance**: The Romans developed the 12 Tables (450 BCE), which formed the basis of their laws, and established the concept of citizenship, paving the way for modern democracy.
3. **Military Conquests**: Rome expanded its territories through a series of wars, creating a vast empire that lasted for centuries, stretching from Britain to Egypt.
4. **Language and Literature**: Latin became"
],
"text/plain": [
""
@@ -217,16 +366,50 @@
{
"cell_type": "code",
"execution_count": 4,
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:45:21.195226Z",
+ "iopub.status.busy": "2024-11-01T02:45:21.194680Z",
+ "iopub.status.idle": "2024-11-01T02:45:21.675473Z",
+ "shell.execute_reply": "2024-11-01T02:45:21.675050Z"
+ }
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-30 09:46:06] INFO: 127.0.0.1:45834 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
- "[2024-10-30 09:46:06 TP0] Prefill batch. #new-seq: 1, #new-token: 15, #cached-token: 25, cache hit rate: 31.40%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
- "It looks like you're getting started with our conversation. I'm happy to chat with you and see how[2024-10-30 09:46:06 TP0] Decode batch. #running-req: 1, #token: 61, token usage: 0.00, gen throughput (token/s): 4.78, #queue-req: 0\n",
- " things go. What would you like to talk about?"
+ "[2024-10-31 19:45:21] INFO: 127.0.0.1:37738 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
+ "[2024-10-31 19:45:21 TP0] Prefill batch. #new-seq: 1, #new-token: 15, #cached-token: 25, cache hit rate: 31.40%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+ "It looks like you're ready to"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " begin"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ ". What kind of test would you like"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " to"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " conduct?"
]
}
],
@@ -255,21 +438,41 @@
{
"cell_type": "code",
"execution_count": 5,
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:45:21.676813Z",
+ "iopub.status.busy": "2024-11-01T02:45:21.676665Z",
+ "iopub.status.idle": "2024-11-01T02:45:23.182104Z",
+ "shell.execute_reply": "2024-11-01T02:45:23.181695Z"
+ }
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-30 09:46:11 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 30.39%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
- "[2024-10-30 09:46:12 TP0] Decode batch. #running-req: 1, #token: 38, token usage: 0.00, gen throughput (token/s): 7.66, #queue-req: 0\n",
- "[2024-10-30 09:46:12] INFO: 127.0.0.1:45834 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:45:21 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 30.39%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+ "[2024-10-31 19:45:21 TP0] Decode batch. #running-req: 1, #token: 11, token usage: 0.00, gen throughput (token/s): 39.18, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:22 TP0] Decode batch. #running-req: 1, #token: 51, token usage: 0.00, gen throughput (token/s): 42.85, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:23] INFO: 127.0.0.1:37738 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
- "Response: Completion(id='1c988750627649f8872965d00cc008d9', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730281572, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))"
+ "Response: Completion(id='84ca7b4df182449697c4b38a454b8834', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States Washington D.C. 2. Japan Tokyo 3. Australia Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China Beijing 2. Brazil Bras', matched_stop=None)], created=1730429123, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, prompt_tokens_details=None))"
],
"text/plain": [
""
@@ -306,24 +509,61 @@
{
"cell_type": "code",
"execution_count": 6,
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:45:23.186337Z",
+ "iopub.status.busy": "2024-11-01T02:45:23.186189Z",
+ "iopub.status.idle": "2024-11-01T02:45:26.769744Z",
+ "shell.execute_reply": "2024-11-01T02:45:26.769299Z"
+ }
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-30 09:46:15 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 29.32%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
- "[2024-10-30 09:46:15 TP0] Decode batch. #running-req: 1, #token: 16, token usage: 0.00, gen throughput (token/s): 12.28, #queue-req: 0\n",
- "[2024-10-30 09:46:15 TP0] Decode batch. #running-req: 1, #token: 56, token usage: 0.00, gen throughput (token/s): 135.70, #queue-req: 0\n",
- "[2024-10-30 09:46:15 TP0] Decode batch. #running-req: 1, #token: 96, token usage: 0.00, gen throughput (token/s): 134.45, #queue-req: 0\n",
- "[2024-10-30 09:46:16 TP0] Decode batch. #running-req: 1, #token: 136, token usage: 0.00, gen throughput (token/s): 133.34, #queue-req: 0\n",
- "[2024-10-30 09:46:16] INFO: 127.0.0.1:45834 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:45:23 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 29.32%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:23 TP0] Decode batch. #running-req: 1, #token: 29, token usage: 0.00, gen throughput (token/s): 40.76, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:24 TP0] Decode batch. #running-req: 1, #token: 69, token usage: 0.00, gen throughput (token/s): 42.13, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:25 TP0] Decode batch. #running-req: 1, #token: 109, token usage: 0.00, gen throughput (token/s): 42.01, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:26 TP0] Decode batch. #running-req: 1, #token: 149, token usage: 0.00, gen throughput (token/s): 41.87, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:26] INFO: 127.0.0.1:37738 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
- "Response: Completion(id='784041b9af634537a7960a0ba6152ba2', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=\"\\xa0\\nOnce upon a time, in a distant corner of the universe, there was a brave space explorer named Captain Orion. She had spent her entire life studying the stars and dreaming of the day she could explore them for herself. Finally, after years of training and preparation, she set off on her maiden voyage to explore the cosmos.\\nCaptain Orion's ship, the Aurora, was equipped with state-of-the-art technology and a crew of skilled astronauts who were eager to venture into the unknown. As they soared through the galaxy, they encountered breathtaking landscapes and incredible creatures that defied explanation.\\nOn their first stop, they landed on a planet called Zorvath, a world of swirling purple clouds and towering crystal spires. Captain Orion and her crew mar\", matched_stop=None)], created=1730281576, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=150, prompt_tokens=10, total_tokens=160, completion_tokens_details=None, prompt_tokens_details=None))"
+ "Response: Completion(id='fe384c17aece4a5ca5fb5238dcd1adec', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=\" This can be a sci-fi story, and you have the ability to create a unique and imaginative universe.\\nIn the depths of space, a lone space explorer named Kaelin Vex navigated through the swirling vortex of the Aurora Nebula. Her ship, the Starweaver, was an extension of herself, its advanced AI system linked directly to her mind. Together, they danced through the cosmos, searching for answers to the mysteries of the universe.\\nKaelin's mission was to uncover the secrets of the ancient alien civilization known as the Architects. Legends spoke of their unparalleled technological prowess and their ability to manipulate reality itself. Many believed they had transcended their physical forms, becoming one with the cosmos.\\nAs Kaelin delved deeper into\", matched_stop=None)], created=1730429126, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=150, prompt_tokens=10, total_tokens=160, prompt_tokens_details=None))"
],
"text/plain": [
""
@@ -369,22 +609,29 @@
},
{
"cell_type": "code",
- "execution_count": 6,
- "metadata": {},
+ "execution_count": 7,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:45:26.772016Z",
+ "iopub.status.busy": "2024-11-01T02:45:26.771868Z",
+ "iopub.status.idle": "2024-11-01T02:45:26.794225Z",
+ "shell.execute_reply": "2024-11-01T02:45:26.793811Z"
+ }
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-28 02:02:55] INFO: 127.0.0.1:43330 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
- "[2024-10-28 02:02:55] INFO: 127.0.0.1:43330 - \"POST /v1/batches HTTP/1.1\" 200 OK\n",
- "[2024-10-28 02:02:55 TP0] Prefill batch. #new-seq: 2, #new-token: 30, #cached-token: 50, cache hit rate: 35.06%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
+ "[2024-10-31 19:45:26] INFO: 127.0.0.1:57182 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
+ "[2024-10-31 19:45:26] INFO: 127.0.0.1:57182 - \"POST /v1/batches HTTP/1.1\" 200 OK\n",
+ "[2024-10-31 19:45:26 TP0] Prefill batch. #new-seq: 2, #new-token: 20, #cached-token: 60, cache hit rate: 42.80%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
]
},
{
"data": {
"text/html": [
- "Batch job created with ID: batch_56fefd2e-0187-4c14-aa2d-110917723dde"
+ "Batch job created with ID: batch_d9af5b49-ad3d-423e-8c30-4aaafa5c18c4"
],
"text/plain": [
""
@@ -446,19 +693,32 @@
},
{
"cell_type": "code",
- "execution_count": 7,
- "metadata": {},
+ "execution_count": 8,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:45:26.796422Z",
+ "iopub.status.busy": "2024-11-01T02:45:26.796273Z",
+ "iopub.status.idle": "2024-11-01T02:45:29.810471Z",
+ "shell.execute_reply": "2024-11-01T02:45:29.810041Z"
+ }
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-28 02:02:56 TP0] Decode batch. #running-req: 2, #token: 82, token usage: 0.00, gen throughput (token/s): 55.10, #queue-req: 0\n",
+ "[2024-10-31 19:45:27 TP0] Decode batch. #running-req: 1, #token: 69, token usage: 0.00, gen throughput (token/s): 51.72, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
"Batch job status: validating...trying again in 3 seconds...\n",
- "[2024-10-28 02:02:58] INFO: 127.0.0.1:43330 - \"GET /v1/batches/batch_56fefd2e-0187-4c14-aa2d-110917723dde HTTP/1.1\" 200 OK\n",
+ "[2024-10-31 19:45:29] INFO: 127.0.0.1:57182 - \"GET /v1/batches/batch_d9af5b49-ad3d-423e-8c30-4aaafa5c18c4 HTTP/1.1\" 200 OK\n",
"Batch job completed successfully!\n",
"Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n",
- "[2024-10-28 02:02:58] INFO: 127.0.0.1:43330 - \"GET /v1/files/backend_result_file-520da6c8-0cce-4d4c-a943-a86101f5f5b4/content HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:45:29] INFO: 127.0.0.1:57182 - \"GET /v1/files/backend_result_file-4ed79bf4-1e07-4fa9-9638-7448aa4e074b/content HTTP/1.1\" 200 OK\n"
]
},
{
@@ -476,7 +736,7 @@
{
"data": {
"text/html": [
- "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730106176, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'A programmer walks into a library and asks the librarian, \"Do you have any books on Pavlov\\'s dogs and Schrödinger\\'s cat?\"\\n\\nThe librarian replies, \"It rings a bell, but I\\'m not sure if it\\'s here'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 50, 'total_tokens': 91}, 'system_fingerprint': None}}"
+ "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730429127, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}"
],
"text/plain": [
""
@@ -500,7 +760,7 @@
{
"data": {
"text/html": [
- "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730106176, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes, including:\\n\\n1. **Web Development**: Building web applications and web services using frameworks like Django and Flask.\\n2. **Data Analysis and'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}"
+ "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730429127, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes such as web development, scientific computing, data analysis, artificial intelligence, and more. It was created in the late 1980s by'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}"
],
"text/plain": [
""
@@ -525,7 +785,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-28 02:02:58] INFO: 127.0.0.1:43330 - \"DELETE /v1/files/backend_result_file-520da6c8-0cce-4d4c-a943-a86101f5f5b4 HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:45:29] INFO: 127.0.0.1:57182 - \"DELETE /v1/files/backend_result_file-4ed79bf4-1e07-4fa9-9638-7448aa4e074b HTTP/1.1\" 200 OK\n"
]
}
],
@@ -574,21 +834,28 @@
},
{
"cell_type": "code",
- "execution_count": 8,
- "metadata": {},
+ "execution_count": 9,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:45:29.812339Z",
+ "iopub.status.busy": "2024-11-01T02:45:29.812198Z",
+ "iopub.status.idle": "2024-11-01T02:45:54.851243Z",
+ "shell.execute_reply": "2024-11-01T02:45:54.850668Z"
+ }
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-28 02:02:58] INFO: 127.0.0.1:43336 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
- "[2024-10-28 02:02:58] INFO: 127.0.0.1:43336 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:45:29] INFO: 127.0.0.1:57186 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
+ "[2024-10-31 19:45:29] INFO: 127.0.0.1:57186 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
- "Created batch job with ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5"
+ "Created batch job with ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2"
],
"text/plain": [
""
@@ -613,23 +880,77 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-28 02:02:58 TP0] Prefill batch. #new-seq: 17, #new-token: 510, #cached-token: 425, cache hit rate: 43.40%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
- "[2024-10-28 02:02:58 TP0] Prefill batch. #new-seq: 83, #new-token: 2490, #cached-token: 2075, cache hit rate: 45.04%, token usage: 0.00, #running-req: 17, #queue-req: 0\n",
- "[2024-10-28 02:02:59 TP0] Decode batch. #running-req: 100, #token: 3725, token usage: 0.02, gen throughput (token/s): 234.43, #queue-req: 0\n",
- "[2024-10-28 02:03:00 TP0] Decode batch. #running-req: 100, #token: 7725, token usage: 0.04, gen throughput (token/s): 3545.41, #queue-req: 0\n",
- "[2024-10-28 02:03:01 TP0] Decode batch. #running-req: 100, #token: 11725, token usage: 0.05, gen throughput (token/s): 3448.10, #queue-req: 0\n",
- "[2024-10-28 02:03:02 TP0] Decode batch. #running-req: 100, #token: 15725, token usage: 0.07, gen throughput (token/s): 3362.62, #queue-req: 0\n",
- "[2024-10-28 02:03:04 TP0] Decode batch. #running-req: 100, #token: 19725, token usage: 0.09, gen throughput (token/s): 3279.58, #queue-req: 0\n",
- "[2024-10-28 02:03:05 TP0] Decode batch. #running-req: 100, #token: 23725, token usage: 0.11, gen throughput (token/s): 3200.86, #queue-req: 0\n",
- "[2024-10-28 02:03:06 TP0] Decode batch. #running-req: 100, #token: 27725, token usage: 0.13, gen throughput (token/s): 3126.52, #queue-req: 0\n",
- "[2024-10-28 02:03:07 TP0] Decode batch. #running-req: 100, #token: 31725, token usage: 0.15, gen throughput (token/s): 3053.16, #queue-req: 0\n",
- "[2024-10-28 02:03:08] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:45:29 TP0] Prefill batch. #new-seq: 27, #new-token: 810, #cached-token: 675, cache hit rate: 45.05%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+ "[2024-10-31 19:45:29 TP0] Prefill batch. #new-seq: 73, #new-token: 2190, #cached-token: 1825, cache hit rate: 45.33%, token usage: 0.00, #running-req: 27, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:30 TP0] Decode batch. #running-req: 100, #token: 5125, token usage: 0.02, gen throughput (token/s): 636.38, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:31 TP0] Decode batch. #running-req: 100, #token: 9125, token usage: 0.04, gen throughput (token/s): 3507.97, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:33 TP0] Decode batch. #running-req: 100, #token: 13125, token usage: 0.06, gen throughput (token/s): 3417.06, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:34 TP0] Decode batch. #running-req: 100, #token: 17125, token usage: 0.08, gen throughput (token/s): 3332.03, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:35 TP0] Decode batch. #running-req: 100, #token: 21125, token usage: 0.10, gen throughput (token/s): 3252.29, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:36 TP0] Decode batch. #running-req: 100, #token: 25125, token usage: 0.12, gen throughput (token/s): 3173.87, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:38 TP0] Decode batch. #running-req: 100, #token: 29125, token usage: 0.13, gen throughput (token/s): 3101.31, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:39 TP0] Decode batch. #running-req: 100, #token: 33125, token usage: 0.15, gen throughput (token/s): 3030.90, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:39] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
- "Batch job details (check 1 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None"
+ "Batch job details (check 1 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: in_progress // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: None"
],
"text/plain": [
""
@@ -654,15 +975,27 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-28 02:03:09 TP0] Decode batch. #running-req: 100, #token: 35725, token usage: 0.16, gen throughput (token/s): 2980.26, #queue-req: 0\n",
- "[2024-10-28 02:03:10 TP0] Decode batch. #running-req: 100, #token: 39725, token usage: 0.18, gen throughput (token/s): 2919.09, #queue-req: 0\n",
- "[2024-10-28 02:03:11] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:45:40 TP0] Decode batch. #running-req: 100, #token: 37125, token usage: 0.17, gen throughput (token/s): 2961.37, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:42 TP0] Decode batch. #running-req: 100, #token: 41125, token usage: 0.19, gen throughput (token/s): 2899.29, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:42] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
- "Batch job details (check 2 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None"
+ "Batch job details (check 2 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: in_progress // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: None"
],
"text/plain": [
""
@@ -687,15 +1020,27 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-28 02:03:11 TP0] Decode batch. #running-req: 100, #token: 43725, token usage: 0.20, gen throughput (token/s): 2854.92, #queue-req: 0\n",
- "[2024-10-28 02:03:13 TP0] Decode batch. #running-req: 100, #token: 47725, token usage: 0.22, gen throughput (token/s): 2794.62, #queue-req: 0\n",
- "[2024-10-28 02:03:14] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:45:43 TP0] Decode batch. #running-req: 100, #token: 45125, token usage: 0.21, gen throughput (token/s): 2836.50, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:45 TP0] Decode batch. #running-req: 100, #token: 49125, token usage: 0.23, gen throughput (token/s): 2777.80, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:45] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
- "Batch job details (check 3 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None"
+ "Batch job details (check 3 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: in_progress // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: None"
],
"text/plain": [
""
@@ -720,14 +1065,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-28 02:03:14 TP0] Decode batch. #running-req: 100, #token: 51725, token usage: 0.24, gen throughput (token/s): 2737.84, #queue-req: 0\n",
- "[2024-10-28 02:03:17] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:45:48] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
- "Batch job details (check 4 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: completed // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: backend_result_file-c10ee9f5-eca8-4357-a922-934543b7f433"
+ "Batch job details (check 4 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: completed // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: backend_result_file-dc391511-07f2-4f94-90cb-3ed09bc4b8a3"
],
"text/plain": [
""
@@ -752,13 +1096,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-28 02:03:20] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:45:51] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
- "Batch job details (check 5 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: completed // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: backend_result_file-c10ee9f5-eca8-4357-a922-934543b7f433"
+ "Batch job details (check 5 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: completed // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: backend_result_file-dc391511-07f2-4f94-90cb-3ed09bc4b8a3"
],
"text/plain": [
""
@@ -853,21 +1197,28 @@
},
{
"cell_type": "code",
- "execution_count": 9,
- "metadata": {},
+ "execution_count": 10,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:45:54.854018Z",
+ "iopub.status.busy": "2024-11-01T02:45:54.853851Z",
+ "iopub.status.idle": "2024-11-01T02:46:07.893199Z",
+ "shell.execute_reply": "2024-11-01T02:46:07.892310Z"
+ }
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-28 02:03:23] INFO: 127.0.0.1:47360 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
- "[2024-10-28 02:03:23] INFO: 127.0.0.1:47360 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:45:54] INFO: 127.0.0.1:33180 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
+ "[2024-10-31 19:45:54] INFO: 127.0.0.1:33180 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
- "Created batch job with ID: batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62"
+ "Created batch job with ID: batch_c30756c3-8c09-4142-9630-9590d6124986"
],
"text/plain": [
""
@@ -892,12 +1243,49 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-28 02:03:23 TP0] Prefill batch. #new-seq: 44, #new-token: 44, #cached-token: 2376, cache hit rate: 60.81%, token usage: 0.01, #running-req: 0, #queue-req: 0\n",
- "[2024-10-28 02:03:23 TP0] Prefill batch. #new-seq: 328, #new-token: 8192, #cached-token: 9824, cache hit rate: 56.49%, token usage: 0.01, #running-req: 44, #queue-req: 128\n",
- "[2024-10-28 02:03:24 TP0] Prefill batch. #new-seq: 129, #new-token: 3864, #cached-token: 3231, cache hit rate: 54.15%, token usage: 0.05, #running-req: 371, #queue-req: 1\n",
- "[2024-10-28 02:03:27 TP0] Decode batch. #running-req: 500, #token: 29025, token usage: 0.13, gen throughput (token/s): 1162.55, #queue-req: 0\n",
- "[2024-10-28 02:03:31 TP0] Decode batch. #running-req: 500, #token: 49025, token usage: 0.23, gen throughput (token/s): 5606.35, #queue-req: 0\n",
- "[2024-10-28 02:03:33] INFO: 127.0.0.1:40110 - \"POST /v1/batches/batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62/cancel HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:45:54 TP0] Prefill batch. #new-seq: 135, #new-token: 1150, #cached-token: 6275, cache hit rate: 67.38%, token usage: 0.01, #running-req: 0, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:55 TP0] Prefill batch. #new-seq: 274, #new-token: 8192, #cached-token: 6850, cache hit rate: 55.74%, token usage: 0.02, #running-req: 135, #queue-req: 91\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:56 TP0] Prefill batch. #new-seq: 92, #new-token: 2758, #cached-token: 2302, cache hit rate: 54.19%, token usage: 0.06, #running-req: 408, #queue-req: 1\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:45:56 TP0] Decode batch. #running-req: 500, #token: 16025, token usage: 0.07, gen throughput (token/s): 409.21, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:00 TP0] Decode batch. #running-req: 500, #token: 36025, token usage: 0.17, gen throughput (token/s): 5777.09, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:03 TP0] Decode batch. #running-req: 500, #token: 56025, token usage: 0.26, gen throughput (token/s): 5530.76, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:04] INFO: 127.0.0.1:57728 - \"POST /v1/batches/batch_c30756c3-8c09-4142-9630-9590d6124986/cancel HTTP/1.1\" 200 OK\n"
]
},
{
@@ -916,7 +1304,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-28 02:03:36] INFO: 127.0.0.1:40110 - \"GET /v1/batches/batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62 HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:46:07] INFO: 127.0.0.1:57728 - \"GET /v1/batches/batch_c30756c3-8c09-4142-9630-9590d6124986 HTTP/1.1\" 200 OK\n"
]
},
{
@@ -947,7 +1335,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "[2024-10-28 02:03:36] INFO: 127.0.0.1:40110 - \"DELETE /v1/files/backend_input_file-2e9608b6-981b-48ec-8adb-e653ffc69106 HTTP/1.1\" 200 OK\n"
+ "[2024-10-31 19:46:07] INFO: 127.0.0.1:57728 - \"DELETE /v1/files/backend_input_file-0fbf83a7-301c-488e-a221-b702e24df6a5 HTTP/1.1\" 200 OK\n"
]
},
{
@@ -961,12 +1349,25 @@
},
"metadata": {},
"output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Successfully deleted local batch_requests.jsonl file"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
}
],
"source": [
"import json\n",
"import time\n",
"from openai import OpenAI\n",
+ "import os\n",
"\n",
"client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
"\n",
@@ -1037,6 +1438,9 @@
" del_response = client.files.delete(uploaded_file.id)\n",
" if del_response.deleted:\n",
" print_highlight(\"Successfully cleaned up input file\")\n",
+ " if os.path.exists(input_file_path):\n",
+ " os.remove(input_file_path)\n",
+ " print_highlight(\"Successfully deleted local batch_requests.jsonl file\")\n",
" except Exception as e:\n",
" print_highlight(f\"Error cleaning up: {e}\")\n",
" raise e"
@@ -1044,8 +1448,15 @@
},
{
"cell_type": "code",
- "execution_count": 7,
- "metadata": {},
+ "execution_count": 11,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:46:07.896114Z",
+ "iopub.status.busy": "2024-11-01T02:46:07.895820Z",
+ "iopub.status.idle": "2024-11-01T02:46:09.365287Z",
+ "shell.execute_reply": "2024-11-01T02:46:09.364705Z"
+ }
+ },
"outputs": [],
"source": [
"terminate_process(server_process)"
@@ -1068,7 +1479,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.12"
+ "version": "3.11.7"
}
},
"nbformat": 4,
diff --git a/docs/deploy.py b/docs/deploy.py
index 75b7ea7f2..35ebbcb6c 100644
--- a/docs/deploy.py
+++ b/docs/deploy.py
@@ -1,22 +1,22 @@
-# Deploy the documents
+# Deploy the documents
import os
from datetime import datetime
-def run_cmd(cmd):
- print(cmd)
- os.system(cmd)
+def run_cmd(cmd):
+ print(cmd)
+ os.system(cmd)
-run_cmd("cd $DOC_SITE_PATH; git pull")
+run_cmd("cd $DOC_SITE_PATH; git pull")
-# (Optional) Remove old files
-# run_cmd("rm -rf $ALPA_SITE_PATH/*")
+# (Optional) Remove old files
+# run_cmd("rm -rf $ALPA_SITE_PATH/*")
-run_cmd("cp -r _build/html/* $DOC_SITE_PATH")
+run_cmd("cp -r _build/html/* $DOC_SITE_PATH")
-cmd_message = f"Update {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
-run_cmd(
- f"cd $DOC_SITE_PATH; git add .; git commit -m '{cmd_message}'; git push origin main"
-)
+cmd_message = f"Update {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
+run_cmd(
+ f"cd $DOC_SITE_PATH; git add .; git commit -m '{cmd_message}'; git push origin main"
+)
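A side note on deploy.py: os.system discards the exit status, so a failed git push still lets the script finish as if the deploy succeeded. A possible hardening, not part of this patch, is to route commands through subprocess.run with check=True so the first failing command aborts the run:

```python
import subprocess


def run_cmd(cmd: str) -> None:
    """Echo the command, run it through the shell, and stop on failure."""
    print(cmd)
    # Unlike os.system, check=True raises CalledProcessError on a
    # non-zero exit code instead of silently returning it.
    subprocess.run(cmd, shell=True, check=True)
```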
diff --git a/docs/release_process.md b/docs/developer/release_process.md
similarity index 100%
rename from docs/release_process.md
rename to docs/developer/release_process.md
diff --git a/docs/setup_github_runner.md b/docs/developer/setup_github_runner.md
similarity index 100%
rename from docs/setup_github_runner.md
rename to docs/developer/setup_github_runner.md
diff --git a/docs/frontend.md b/docs/frontend/frontend.md
similarity index 100%
rename from docs/frontend.md
rename to docs/frontend/frontend.md
diff --git a/docs/index.rst b/docs/index.rst
index 8cb286009..420655241 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -15,35 +15,35 @@ The core features include:
:maxdepth: 1
:caption: Getting Started
- install.md
- send_request.ipynb
+ starts/install.md
+ starts/send_request.ipynb
.. toctree::
:maxdepth: 1
:caption: Backend Tutorial
- openai_api.ipynb
- backend.md
+ backend/openai_api.ipynb
+ backend/backend.md
.. toctree::
:maxdepth: 1
:caption: Frontend Tutorial
- frontend.md
+ frontend/frontend.md
.. toctree::
:maxdepth: 1
:caption: References
- sampling_params.md
- hyperparameter_tuning.md
- model_support.md
- contributor_guide.md
- choices_methods.md
- benchmark_and_profiling.md
- troubleshooting.md
- embedding_model.ipynb
- learn_more.md
+ references/sampling_params.md
+ references/hyperparameter_tuning.md
+ references/model_support.md
+ references/contributor_guide.md
+ references/choices_methods.md
+ references/benchmark_and_profiling.md
+ references/troubleshooting.md
+ references/embedding_model.ipynb
+ references/learn_more.md
diff --git a/docs/benchmark_and_profiling.md b/docs/references/benchmark_and_profiling.md
similarity index 100%
rename from docs/benchmark_and_profiling.md
rename to docs/references/benchmark_and_profiling.md
diff --git a/docs/choices_methods.md b/docs/references/choices_methods.md
similarity index 100%
rename from docs/choices_methods.md
rename to docs/references/choices_methods.md
diff --git a/docs/contributor_guide.md b/docs/references/contributor_guide.md
similarity index 100%
rename from docs/contributor_guide.md
rename to docs/references/contributor_guide.md
diff --git a/docs/custom_chat_template.md b/docs/references/custom_chat_template.md
similarity index 100%
rename from docs/custom_chat_template.md
rename to docs/references/custom_chat_template.md
diff --git a/docs/hyperparameter_tuning.md b/docs/references/hyperparameter_tuning.md
similarity index 100%
rename from docs/hyperparameter_tuning.md
rename to docs/references/hyperparameter_tuning.md
diff --git a/docs/learn_more.md b/docs/references/learn_more.md
similarity index 100%
rename from docs/learn_more.md
rename to docs/references/learn_more.md
diff --git a/docs/model_support.md b/docs/references/model_support.md
similarity index 100%
rename from docs/model_support.md
rename to docs/references/model_support.md
diff --git a/docs/sampling_params.md b/docs/references/sampling_params.md
similarity index 100%
rename from docs/sampling_params.md
rename to docs/references/sampling_params.md
diff --git a/docs/troubleshooting.md b/docs/references/troubleshooting.md
similarity index 100%
rename from docs/troubleshooting.md
rename to docs/references/troubleshooting.md
diff --git a/docs/send_request.ipynb b/docs/send_request.ipynb
deleted file mode 100644
index f616912af..000000000
--- a/docs/send_request.ipynb
+++ /dev/null
@@ -1,222 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Quick Start: Launch A Server and Send Requests\n",
- "\n",
- "This notebook provides a quick-start guide for using SGLang after installation."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Launch a server\n",
- "\n",
- "This code block is equivalent to executing \n",
- "\n",
- "```bash\n",
- "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
- "--port 30000 --host 0.0.0.0\n",
- "```\n",
- "\n",
- "in your command line and wait for the server to be ready."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[2024-10-30 09:32:30] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=335520337, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
- "[2024-10-30 09:32:39 TP0] Init torch distributed begin.\n",
- "[2024-10-30 09:32:43 TP0] Load weight begin. avail mem=76.83 GB\n",
- "[2024-10-30 09:32:43 TP0] lm_eval is not installed, GPTQ may not be usable\n",
- "INFO 10-30 09:32:43 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
- "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00, ?it/s]\n",
- "Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:01<00:05, 1.78s/it]\n",
- "Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:03<00:03, 1.78s/it]\n",
- "Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:05<00:01, 1.80s/it]\n",
- "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00, 1.30s/it]\n",
- "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00, 1.48s/it]\n",
- "\n",
- "[2024-10-30 09:32:49 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=61.82 GB\n",
- "[2024-10-30 09:32:49 TP0] Memory pool end. avail mem=8.19 GB\n",
- "[2024-10-30 09:32:51 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
- "[2024-10-30 09:32:59 TP0] max_total_num_tokens=430915, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
- "[2024-10-30 09:33:00] INFO: Started server process [227758]\n",
- "[2024-10-30 09:33:00] INFO: Waiting for application startup.\n",
- "[2024-10-30 09:33:00] INFO: Application startup complete.\n",
- "[2024-10-30 09:33:00] INFO: Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
- "[2024-10-30 09:33:01] INFO: 127.0.0.1:49220 - \"GET /v1/models HTTP/1.1\" 200 OK\n",
- "[2024-10-30 09:33:01] INFO: 127.0.0.1:49236 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
- "[2024-10-30 09:33:01 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
- "[2024-10-30 09:33:01] INFO: 127.0.0.1:49240 - \"POST /generate HTTP/1.1\" 200 OK\n",
- "[2024-10-30 09:33:01] The server is fired up and ready to roll!\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "
NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "from sglang.utils import (\n",
- " execute_shell_command,\n",
- " wait_for_server,\n",
- " terminate_process,\n",
- " print_highlight,\n",
- ")\n",
- "\n",
- "server_process = execute_shell_command(\n",
- "\"\"\"\n",
- "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
- "--port 30000 --host 0.0.0.0\n",
- "\"\"\"\n",
- ")\n",
- "\n",
- "wait_for_server(\"http://localhost:30000\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Send a Request\n",
- "\n",
- "Once the server is running, you can send test requests using curl. The server implements the [OpenAI-compatible API](https://platform.openai.com/docs/api-reference/chat)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[2024-10-30 09:34:00 TP0] Prefill batch. #new-seq: 1, #new-token: 46, #cached-token: 1, cache hit rate: 1.85%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
- "[2024-10-30 09:34:00 TP0] Decode batch. #running-req: 1, #token: 80, token usage: 0.00, gen throughput (token/s): 0.65, #queue-req: 0\n",
- "[2024-10-30 09:34:01 TP0] Decode batch. #running-req: 1, #token: 120, token usage: 0.00, gen throughput (token/s): 139.05, #queue-req: 0\n",
- "[2024-10-30 09:34:01 TP0] Decode batch. #running-req: 1, #token: 160, token usage: 0.00, gen throughput (token/s): 137.75, #queue-req: 0\n",
- "[2024-10-30 09:34:01 TP0] Decode batch. #running-req: 1, #token: 200, token usage: 0.00, gen throughput (token/s): 137.59, #queue-req: 0\n",
- "[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 240, token usage: 0.00, gen throughput (token/s): 137.62, #queue-req: 0\n",
- "[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 280, token usage: 0.00, gen throughput (token/s): 137.61, #queue-req: 0\n",
- "[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 320, token usage: 0.00, gen throughput (token/s): 137.49, #queue-req: 0\n",
- "[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 360, token usage: 0.00, gen throughput (token/s): 137.51, #queue-req: 0\n",
- "[2024-10-30 09:34:03 TP0] Decode batch. #running-req: 1, #token: 400, token usage: 0.00, gen throughput (token/s): 137.47, #queue-req: 0\n",
- "[2024-10-30 09:34:03 TP0] Decode batch. #running-req: 1, #token: 440, token usage: 0.00, gen throughput (token/s): 137.48, #queue-req: 0\n",
- "[2024-10-30 09:34:03 TP0] Decode batch. #running-req: 1, #token: 480, token usage: 0.00, gen throughput (token/s): 137.47, #queue-req: 0\n",
- "[2024-10-30 09:34:04 TP0] Decode batch. #running-req: 1, #token: 520, token usage: 0.00, gen throughput (token/s): 137.47, #queue-req: 0\n",
- "[2024-10-30 09:34:04] INFO: 127.0.0.1:54110 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
- "{\"id\":\"a53e18ead1314ab0a2cec76cef484c11\",\"object\":\"chat.completion\",\"created\":1730280844,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) model that is designed to process and understand human language in a way that's similar to how humans do. \\n\\nLLMs are trained on vast amounts of text data, which allows them to learn patterns, relationships, and context within language. This training enables them to generate human-like responses to a wide range of questions, prompts, and topics.\\n\\nSome common characteristics of LLMs include:\\n\\n1. **Language understanding**: LLMs can comprehend the meaning and context of language, including nuances like idioms, sarcasm, and figurative language.\\n2. **Language generation**: LLMs can generate text that's coherent, contextually relevant, and often engaging.\\n3. **Knowledge retrieval**: LLMs can access and retrieve information from their vast training datasets, allowing them to answer questions and provide information on a wide range of topics.\\n4. **Conversational dialogue**: LLMs can engage in natural-sounding conversations, using context and understanding to respond to questions and statements.\\n\\nLLMs have many applications, including:\\n\\n1. **Virtual assistants**: LLMs power virtual assistants like Siri, Alexa, and Google Assistant.\\n2. **Language translation**: LLMs can translate languages in real-time, with high accuracy.\\n3. **Content generation**: LLMs can generate text, such as articles, emails, and social media posts.\\n4. **Chatbots**: LLMs can power chatbots that provide customer support, answer questions, and engage in conversations.\\n\\nSome popular examples of LLMs include:\\n\\n1. **BERT (Bidirectional Encoder Representations from Transformers)**: Developed by Google, BERT is a widely used LLM that's been trained on a massive dataset of text.\\n2. **RoBERTa (Robustly Optimized BERT Pretraining Approach)**: Developed by Facebook AI, RoBERTa is another popular LLM that's been trained on a large dataset of text.\\n3. **Language models from OpenAI**: OpenAI has developed a range of LLMs, including GPT-3 (Generative Pre-trained Transformer 3), which is one of the most advanced LLMs available today.\\n\\nOverall, LLMs have the potential to revolutionize the way we interact with language and information, making it easier to access and understand complex topics, and opening up new possibilities for language-based applications.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":539,\"completion_tokens\":492,\"prompt_tokens_details\":null}}"
- ]
- }
- ],
- "source": [
- "!curl http://localhost:30000/v1/chat/completions \\\n",
- " -H \"Content-Type: application/json\" \\\n",
- " -H \"Authorization: Bearer None\" \\\n",
- " -d '{\"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"messages\": [{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}, {\"role\": \"user\", \"content\": \"What is a LLM?\"}]}'"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Using OpenAI Python Client\n",
- "\n",
- "You can also use the OpenAI Python API library to send requests."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[2024-10-30 09:34:06 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
- "[2024-10-30 09:34:07 TP0] Decode batch. #running-req: 1, #token: 71, token usage: 0.00, gen throughput (token/s): 13.51, #queue-req: 0\n",
- "[2024-10-30 09:34:07] INFO: 127.0.0.1:42068 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "ChatCompletion(id='0708a0196e524456a1316359f6189e48', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730280847, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "import openai\n",
- "\n",
- "client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
- "\n",
- "response = client.chat.completions.create(\n",
- " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
- " messages=[\n",
- " {\"role\": \"system\", \"content\": \"You are a helpful AI assistant\"},\n",
- " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
- " ],\n",
- " temperature=0,\n",
- " max_tokens=64,\n",
- ")\n",
- "print_highlight(response)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "terminate_process(server_process)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.12"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/docs/install.md b/docs/starts/install.md
similarity index 100%
rename from docs/install.md
rename to docs/starts/install.md
diff --git a/docs/starts/send_request.ipynb b/docs/starts/send_request.ipynb
new file mode 100644
index 000000000..dda2371b5
--- /dev/null
+++ b/docs/starts/send_request.ipynb
@@ -0,0 +1,403 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Quick Start: Launch A Server and Send Requests\n",
+ "\n",
+ "This notebook provides a quick-start guide for using SGLang after installation."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Launch a server\n",
+ "\n",
+ "This code block is equivalent to executing \n",
+ "\n",
+ "```bash\n",
+ "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
+ "--port 30000 --host 0.0.0.0\n",
+ "```\n",
+ "\n",
+ "in your command line and wait for the server to be ready."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:46:13.611212Z",
+ "iopub.status.busy": "2024-11-01T02:46:13.611093Z",
+ "iopub.status.idle": "2024-11-01T02:46:42.810261Z",
+ "shell.execute_reply": "2024-11-01T02:46:42.809147Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:18] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=706578968, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+ " warnings.warn(\n",
+ "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:24 TP0] Init torch distributed begin.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:24 TP0] Load weight begin. avail mem=47.27 GB\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:25 TP0] lm_eval is not installed, GPTQ may not be usable\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "INFO 10-31 19:46:26 weight_utils.py:243] Using model weights format ['*.safetensors']\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00, ?it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:00<00:01, 2.50it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:00<00:00, 2.39it/s]\n",
+ "\r",
+ "Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:00<00:00, 3.45it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00, 2.95it/s]\n",
+ "\r",
+ "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00, 2.90it/s]\n",
+ "\n",
+ "[2024-10-31 19:46:28 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=32.22 GB\n",
+ "[2024-10-31 19:46:28 TP0] Memory pool end. avail mem=4.60 GB\n",
+ "[2024-10-31 19:46:28 TP0] Capture cuda graph begin. This can take up to several minutes.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:36 TP0] max_total_num_tokens=217512, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:36] INFO: Started server process [1548791]\n",
+ "[2024-10-31 19:46:36] INFO: Waiting for application startup.\n",
+ "[2024-10-31 19:46:36] INFO: Application startup complete.\n",
+ "[2024-10-31 19:46:36] INFO: Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:37] INFO: 127.0.0.1:46022 - \"GET /v1/models HTTP/1.1\" 200 OK\n",
+ "[2024-10-31 19:46:37] INFO: 127.0.0.1:46028 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
+ "[2024-10-31 19:46:37 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:38] INFO: 127.0.0.1:46042 - \"POST /generate HTTP/1.1\" 200 OK\n",
+ "[2024-10-31 19:46:38] The server is fired up and ready to roll!\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from sglang.utils import (\n",
+ " execute_shell_command,\n",
+ " wait_for_server,\n",
+ " terminate_process,\n",
+ " print_highlight,\n",
+ ")\n",
+ "\n",
+ "server_process = execute_shell_command(\n",
+ "\"\"\"\n",
+ "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
+ "--port 30000 --host 0.0.0.0\n",
+ "\"\"\"\n",
+ ")\n",
+ "\n",
+ "wait_for_server(\"http://localhost:30000\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Send a Request\n",
+ "\n",
+ "Once the server is running, you can send test requests using curl. The server implements the [OpenAI-compatible API](https://platform.openai.com/docs/api-reference/chat)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:46:42.813656Z",
+ "iopub.status.busy": "2024-11-01T02:46:42.813354Z",
+ "iopub.status.idle": "2024-11-01T02:46:51.436613Z",
+ "shell.execute_reply": "2024-11-01T02:46:51.435965Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:42 TP0] Prefill batch. #new-seq: 1, #new-token: 46, #cached-token: 1, cache hit rate: 1.85%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:43 TP0] Decode batch. #running-req: 1, #token: 80, token usage: 0.00, gen throughput (token/s): 5.40, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:44 TP0] Decode batch. #running-req: 1, #token: 120, token usage: 0.00, gen throughput (token/s): 42.48, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:45 TP0] Decode batch. #running-req: 1, #token: 160, token usage: 0.00, gen throughput (token/s): 42.37, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:46 TP0] Decode batch. #running-req: 1, #token: 200, token usage: 0.00, gen throughput (token/s): 42.33, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:47 TP0] Decode batch. #running-req: 1, #token: 240, token usage: 0.00, gen throughput (token/s): 42.34, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:48 TP0] Decode batch. #running-req: 1, #token: 280, token usage: 0.00, gen throughput (token/s): 42.28, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:49 TP0] Decode batch. #running-req: 1, #token: 320, token usage: 0.00, gen throughput (token/s): 42.28, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:50 TP0] Decode batch. #running-req: 1, #token: 360, token usage: 0.00, gen throughput (token/s): 42.24, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:51] INFO: 127.0.0.1:46046 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
+ "{\"id\":\"f9761ee1b1444bd7a640286884a90842\",\"object\":\"chat.completion\",\"created\":1730429211,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and comprehend human language in a way that's similar to how humans do.\\n\\nLarge Language Models are trained on massive amounts of text data, which allows them to learn patterns and relationships in language. This training enables them to generate text, answer questions, summarize content, and even engage in conversation.\\n\\nSome key characteristics of LLMs include:\\n\\n1. **Language understanding**: LLMs can comprehend the meaning of text, including nuances like idioms, sarcasm, and figurative language.\\n2. **Contextual awareness**: LLMs can understand the context in which a piece of text is written, including the topic, tone, and intent.\\n3. **Generative capabilities**: LLMs can generate text, including entire articles, conversations, or even creative writing like stories or poetry.\\n4. **Continuous learning**: LLMs can learn from new data and update their understanding of language over time.\\n\\nLLMs are used in a wide range of applications, including:\\n\\n1. **Virtual assistants**: LLMs power virtual assistants like Siri, Alexa, and Google Assistant.\\n2. **Chatbots**: LLMs are used to create chatbots that can engage with customers and provide support.\\n3. **Language translation**: LLMs can translate text from one language to another with high accuracy.\\n4. **Content generation**: LLMs can generate content, such as articles, social media posts, and product descriptions.\\n5. **Research and analysis**: LLMs can help researchers analyze and understand large amounts of text data.\\n\\nIn the context of our conversation, I'm a Large Language Model designed to provide helpful and informative responses to your questions!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":400,\"completion_tokens\":353,\"prompt_tokens_details\":null}}"
+ ]
+ }
+ ],
+ "source": [
+ "!curl http://localhost:30000/v1/chat/completions \\\n",
+ " -H \"Content-Type: application/json\" \\\n",
+ " -H \"Authorization: Bearer None\" \\\n",
+ " -d '{\"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"messages\": [{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}, {\"role\": \"user\", \"content\": \"What is a LLM?\"}]}'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using OpenAI Python Client\n",
+ "\n",
+ "You can also use the OpenAI Python API library to send requests."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:46:51.439372Z",
+ "iopub.status.busy": "2024-11-01T02:46:51.439178Z",
+ "iopub.status.idle": "2024-11-01T02:46:52.895776Z",
+ "shell.execute_reply": "2024-11-01T02:46:52.895318Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:51 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+ "[2024-10-31 19:46:51 TP0] Decode batch. #running-req: 1, #token: 50, token usage: 0.00, gen throughput (token/s): 27.57, #queue-req: 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2024-10-31 19:46:52 TP0] Decode batch. #running-req: 1, #token: 90, token usage: 0.00, gen throughput (token/s): 42.69, #queue-req: 0\n",
+ "[2024-10-31 19:46:52] INFO: 127.0.0.1:40952 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "ChatCompletion(id='c563abb8fe74496f83203fe21ec4ff61', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730429212, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import openai\n",
+ "\n",
+ "client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
+ "\n",
+ "response = client.chat.completions.create(\n",
+ " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+ " messages=[\n",
+ " {\"role\": \"system\", \"content\": \"You are a helpful AI assistant\"},\n",
+ " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
+ " ],\n",
+ " temperature=0,\n",
+ " max_tokens=64,\n",
+ ")\n",
+ "print_highlight(response)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-01T02:46:52.898411Z",
+ "iopub.status.busy": "2024-11-01T02:46:52.898149Z",
+ "iopub.status.idle": "2024-11-01T02:46:54.398382Z",
+ "shell.execute_reply": "2024-11-01T02:46:54.397564Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "terminate_process(server_process)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
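The launch cell in send_request.ipynb blocks on wait_for_server from sglang.utils before sending any requests. As a rough mental model of what such a readiness check does (the actual helper may use a different endpoint or retry policy), it polls the OpenAI-compatible /v1/models route until the server answers:

```python
import time

import requests


def wait_for_server_sketch(base_url: str, timeout: float = 120.0) -> None:
    """Illustrative readiness check; not the real sglang.utils implementation.

    Polls `base_url` until the OpenAI-compatible API responds or `timeout`
    seconds elapse.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            # /v1/models is served once the HTTP server is accepting requests.
            if requests.get(f"{base_url}/v1/models").status_code == 200:
                return
        except requests.exceptions.ConnectionError:
            pass  # Not listening yet; keep waiting.
        time.sleep(1)
    raise TimeoutError(f"Server at {base_url} did not start within {timeout}s")
```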