From ced362f7c60f9bf36d659423aa23aba6c9691018 Mon Sep 17 00:00:00 2001
From: Chayenne <zhaochen20@outlook.com>
Date: Sat, 26 Oct 2024 10:44:11 -0700
Subject: [PATCH] Simplify our docs with complicated functions into utils
 (#1807)

Co-authored-by: Chayenne <zhaochenyang@ucla.edu>
---
 .github/workflows/deploy-docs.yml      | 15 +++--
 .github/workflows/execute-notebook.yml | 42 +++++++++++++
 docs/embedding_model.ipynb             | 49 +++++-----------
 docs/send_request.ipynb                | 75 +++---------------------
 python/sglang/utils.py                 | 81 ++++++++++++++++++++++++++
 5 files changed, 159 insertions(+), 103 deletions(-)
 create mode 100644 .github/workflows/execute-notebook.yml

diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml
index 5b00ee578..719db0e92 100644
--- a/.github/workflows/deploy-docs.yml
+++ b/.github/workflows/deploy-docs.yml
@@ -1,12 +1,18 @@
-name: Build Documentation
+name: Release Documentation
 
 on:
+  push:
+    branches:
+      - main
+    paths:
+      - 'docs/**'
+      - 'python/sglang/version.py'
   workflow_dispatch:
 
 jobs:
   execute-notebooks:
     runs-on: 1-gpu-runner
-    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    if: github.repository == 'sgl-project/sglang'
     steps:
       - name: Checkout code
         uses: actions/checkout@v3
@@ -38,8 +44,9 @@ jobs:
           done
 
   build-and-deploy:
+    needs: execute-notebooks
     if: github.repository == 'sgl-project/sglang'
-    runs-on: 1-gpu-runner
+    runs-on: ubuntu-latest
     steps:
       - name: Checkout code
         uses: actions/checkout@v3
@@ -75,4 +82,4 @@ jobs:
           git commit -m "Update $(date +'%Y-%m-%d %H:%M:%S')"
           git push https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git main
           cd ..
-          rm -rf sgl-project.github.io
+          rm -rf sgl-project.github.io
\ No newline at end of file
diff --git a/.github/workflows/execute-notebook.yml b/.github/workflows/execute-notebook.yml
new file mode 100644
index 000000000..48578bf73
--- /dev/null
+++ b/.github/workflows/execute-notebook.yml
@@ -0,0 +1,42 @@
+name: Execute Notebooks
+
+on:
+  pull_request:  # no filters: runs for every pull request
+  push:
+    branches:
+      - main
+  workflow_dispatch:  # allows the workflow to be started manually
+
+jobs:
+  run-all-notebooks:
+    runs-on: 1-gpu-runner  # custom runner label; presumably a single-GPU machine — confirm
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'  # outside the upstream repo, only pull_request events run
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.9'
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci_install_dependency.sh
+          pip install -r docs/requirements.txt
+
+      - name: Setup Jupyter Kernel  # registers the "python3" kernel that the nbconvert step selects by name
+        run: |
+          python -m ipykernel install --user --name python3 --display-name "Python 3"
+
+      - name: Execute notebooks  # executes each docs/*.ipynb in place; the 600 s limit is nbconvert's per-cell timeout
+        run: |
+          cd docs
+          for nb in *.ipynb; do
+            if [ -f "$nb" ]; then
+              echo "Executing $nb"
+              jupyter nbconvert --to notebook --execute --inplace "$nb" \
+                --ExecutePreprocessor.timeout=600 \
+                --ExecutePreprocessor.kernel_name=python3
+            fi
+          done
\ No newline at end of file
diff --git a/docs/embedding_model.ipynb b/docs/embedding_model.ipynb
index f2e155b02..bfa44aa11 100644
--- a/docs/embedding_model.ipynb
+++ b/docs/embedding_model.ipynb
@@ -28,42 +28,16 @@
     }
    ],
    "source": [
-    "import subprocess\n",
-    "import time\n",
-    "import requests\n",
-    "\n",
     "# Equivalent to running this in the shell:\n",
     "# python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n",
-    "embedding_process = subprocess.Popen(\n",
-    "    [\n",
-    "        \"python\",\n",
-    "        \"-m\",\n",
-    "        \"sglang.launch_server\",\n",
-    "        \"--model-path\",\n",
-    "        \"Alibaba-NLP/gte-Qwen2-7B-instruct\",\n",
-    "        \"--port\",\n",
-    "        \"30010\",\n",
-    "        \"--host\",\n",
-    "        \"0.0.0.0\",\n",
-    "        \"--is-embedding\",\n",
-    "        \"--log-level\",\n",
-    "        \"error\",\n",
-    "    ],\n",
-    "    text=True,\n",
-    "    stdout=subprocess.DEVNULL,\n",
-    "    stderr=subprocess.DEVNULL,\n",
-    ")\n",
+    "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n",
     "\n",
-    "while True:\n",
-    "    try:\n",
-    "        response = requests.get(\n",
-    "            \"http://localhost:30010/v1/models\",\n",
-    "            headers={\"Authorization\": \"Bearer None\"},\n",
-    "        )\n",
-    "        if response.status_code == 200:\n",
-    "            break\n",
-    "    except requests.exceptions.RequestException:\n",
-    "        time.sleep(1)\n",
+    "embedding_process = execute_shell_command(\"\"\"\n",
+    "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n",
+    "    --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n",
+    "\"\"\")\n",
+    "\n",
+    "wait_for_server(\"http://localhost:30010\")\n",
     "\n",
     "print(\"Embedding server is ready. Proceeding with the next steps.\")"
    ]
@@ -134,6 +108,15 @@
     "embedding = response.data[0].embedding[:10]\n",
     "print(embedding)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(embedding_process)"
+   ]
   }
  ],
  "metadata": {
diff --git a/docs/send_request.ipynb b/docs/send_request.ipynb
index a305ccfb8..8062ae004 100644
--- a/docs/send_request.ipynb
+++ b/docs/send_request.ipynb
@@ -36,41 +36,15 @@
     }
    ],
    "source": [
-    "import subprocess\n",
-    "import time\n",
-    "import requests\n",
-    "import os\n",
+    "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n",
     "\n",
-    "server_process = subprocess.Popen(\n",
-    "    [\n",
-    "        \"python\",\n",
-    "        \"-m\",\n",
-    "        \"sglang.launch_server\",\n",
-    "        \"--model-path\",\n",
-    "        \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
-    "        \"--port\",\n",
-    "        \"30000\",\n",
-    "        \"--host\",\n",
-    "        \"0.0.0.0\",\n",
-    "        \"--log-level\",\n",
-    "        \"error\",\n",
-    "    ],\n",
-    "    text=True,\n",
-    "    stdout=subprocess.DEVNULL,\n",
-    "    stderr=subprocess.DEVNULL,\n",
-    ")\n",
     "\n",
-    "while True:\n",
-    "    try:\n",
-    "        response = requests.get(\n",
-    "            \"http://localhost:30000/v1/models\",\n",
-    "            headers={\"Authorization\": \"Bearer None\"},\n",
-    "        )\n",
-    "        if response.status_code == 200:\n",
-    "            break\n",
-    "    except requests.exceptions.RequestException:\n",
-    "        time.sleep(1)\n",
+    "server_process = execute_shell_command(\"\"\"\n",
+    "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
+    "--port 30000 --host 0.0.0.0 --log-level warning\n",
+    "\"\"\")\n",
     "\n",
+    "wait_for_server(\"http://localhost:30000\")\n",
     "print(\"Server is ready. Proceeding with the next steps.\")"
    ]
   },
@@ -92,7 +66,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{\"id\":\"1449c9c20d4448299431a57facc68d7a\",\"object\":\"chat.completion\",\"created\":1729816891,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. LLMs are trained on vast amounts of text data, which enables them to learn patterns, relationships, and nuances of language.\\n\\nLarge Language Models are typically trained using a technique called deep learning, where multiple layers of artificial neural networks are used to analyze and understand the input data. This training process involves feeding the model massive amounts of text data, which it uses to learn and improve its language understanding and generation capabilities.\\n\\nSome key characteristics of LLMs include:\\n\\n1. **Language understanding**: LLMs can comprehend natural language, including its syntax, semantics, and context.\\n2. **Language generation**: LLMs can generate text, including responses to user input, articles, stories, and more.\\n3. **Contextual understanding**: LLMs can understand the context in which language is being used, including the topic, tone, and intent.\\n4. **Self-supervised learning**: LLMs can learn from large datasets without explicit supervision or labeling.\\n\\nLLMs have a wide range of applications, including:\\n\\n1. **Virtual assistants**: LLMs power virtual assistants like Siri, Alexa, and Google Assistant.\\n2. **Language translation**: LLMs can translate text from one language to another.\\n3. **Text summarization**: LLMs can summarize long pieces of text into shorter, more digestible versions.\\n4. **Content generation**: LLMs can generate content, such as news articles, product descriptions, and social media posts.\\n5. 
**Chatbots**: LLMs can power chatbots that can have human-like conversations with users.\\n\\nThe Large Language Model I am, is a type of LLM that has been trained on a massive dataset of text and can answer a wide range of questions and engage in conversation.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":426,\"completion_tokens\":379,\"prompt_tokens_details\":null}}"
+      "{\"id\":\"449710eb827c49c99b82ce187e912c2a\",\"object\":\"chat.completion\",\"created\":1729962606,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. These models are trained on vast amounts of text data, allowing them to learn patterns, relationships, and context within language.\\n\\nLarge language models use various techniques, such as deep learning and natural language processing, to analyze and understand the input text. They can then use this understanding to generate coherent and context-specific text, such as:\\n\\n1. Responses to questions or prompts\\n2. Summaries of long pieces of text\\n3. Creative writing, like stories or poetry\\n4. Translation of text from one language to another\\n\\nSome popular examples of LLMs include:\\n\\n1. Chatbots: Virtual assistants that can understand and respond to user input\\n2. Virtual assistants: Like Siri, Alexa, or Google Assistant\\n3. Language translation tools: Such as Google Translate\\n4. Writing assistants: Like Grammarly or Language Tool\\n\\nThe key characteristics of LLMs include:\\n\\n1. **Scalability**: They can process large amounts of text data\\n2. **Flexibility**: They can be fine-tuned for specific tasks or domains\\n3. **Contextual understanding**: They can recognize context and nuances in language\\n4. **Creativity**: They can generate original text or responses\\n\\nHowever, LLMs also have limitations and potential drawbacks:\\n\\n1. **Bias**: They can perpetuate existing biases in the training data\\n2. **Misinformation**: They can spread misinformation or false information\\n3. 
**Dependence on data quality**: The quality of the training data directly affects the model's performance\\n\\nOverall, LLMs are powerful tools that can be used in various applications, from language translation and writing assistance to chatbots and virtual assistants.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":408,\"completion_tokens\":361,\"prompt_tokens_details\":null}}"
      ]
     }
    ],
@@ -121,7 +95,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "ChatCompletion(id='16757c3dd6e14a6e9bafd1122f84e4c5', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1729816893, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))\n"
+      "ChatCompletion(id='6bbf20fed17940739eb5cd5d685fa29a', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1729962608, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))\n"
      ]
     }
    ],
@@ -155,38 +129,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import signal\n",
-    "import gc\n",
-    "import torch\n",
-    "\n",
-    "def terminate_process(process):\n",
-    "    try:\n",
-    "        process.terminate()\n",
-    "        try:\n",
-    "            process.wait(timeout=5)\n",
-    "        except subprocess.TimeoutExpired:\n",
-    "            if os.name != 'nt':\n",
-    "                try:\n",
-    "                    pgid = os.getpgid(process.pid)\n",
-    "                    os.killpg(pgid, signal.SIGTERM)\n",
-    "                    time.sleep(1)\n",
-    "                    if process.poll() is None:\n",
-    "                        os.killpg(pgid, signal.SIGKILL)\n",
-    "                except ProcessLookupError:\n",
-    "                    pass\n",
-    "            else:\n",
-    "                process.kill()\n",
-    "            process.wait()\n",
-    "    except Exception as e:\n",
-    "        print(f\"Warning: {e}\")\n",
-    "    finally:\n",
-    "        gc.collect()\n",
-    "        if torch.cuda.is_available():\n",
-    "            torch.cuda.empty_cache()\n",
-    "            torch.cuda.ipc_collect()\n",
-    "\n",
-    "terminate_process(server_process)\n",
-    "time.sleep(2)"
+    "terminate_process(server_process)"
    ]
   }
  ],
diff --git a/python/sglang/utils.py b/python/sglang/utils.py
index 621efb537..139a01c42 100644
--- a/python/sglang/utils.py
+++ b/python/sglang/utils.py
@@ -1,12 +1,15 @@
 """Common utilities."""
 
 import base64
+import gc
 import importlib
 import json
 import logging
 import os
 import signal
+import subprocess
 import sys
+import time
 import traceback
 import urllib.request
 from concurrent.futures import ThreadPoolExecutor
@@ -16,6 +19,7 @@ from typing import Optional, Union
 
 import numpy as np
 import requests
+import torch
 from tqdm import tqdm
 
 logger = logging.getLogger(__name__)
@@ -294,3 +298,80 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
             bar.update(len(chunk))
 
     return filename
+
+
+def execute_shell_command(command: str) -> subprocess.Popen:
+    r"""Start a command as a child process with its output discarded.
+
+    Args:
+        command: Command line as a string (can include \ line continuations).
+            Quoted arguments are kept together, as a POSIX shell would do.
+    Returns:
+        subprocess.Popen: Handle of the started (still running) process.
+    """
+    import shlex  # local import: only this helper needs it
+
+    # Join "\"-continued lines, then tokenize so quoted arguments survive.
+    command = command.replace("\\\n", " ").replace("\\", " ")
+    return subprocess.Popen(
+        shlex.split(command),
+        text=True,
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+    )
+
+
+def wait_for_server(base_url: str, timeout: Optional[int] = None) -> None:
+    """Block until GET {base_url}/v1/models answers with HTTP 200.
+
+    Args:
+        base_url: The base URL of the server
+        timeout: Maximum time to wait in seconds. None means wait forever.
+    Raises:
+        TimeoutError: if `timeout` elapses before the server is ready.
+    """
+    url, headers = f"{base_url}/v1/models", {"Authorization": "Bearer None"}
+    start_time = time.time()
+    while True:
+        try:
+            if requests.get(url, headers=headers).status_code == 200:
+                return
+        except requests.exceptions.RequestException:
+            pass  # server not accepting connections yet; retry below
+        # Checked after every failed attempt, including refused connections.
+        if timeout and time.time() - start_time > timeout:
+            raise TimeoutError("Server did not become ready within timeout period")
+        time.sleep(1)
+
+
+def terminate_process(process):
+    """Terminate a process, escalating to a kill after 5 s, then free GPU caches.
+
+    Args:
+        process: subprocess.Popen object to terminate
+    """
+    try:
+        process.terminate()
+        try:
+            process.wait(timeout=5)
+        except subprocess.TimeoutExpired:
+            try:
+                pgid = os.getpgid(process.pid) if os.name != "nt" else None
+                if pgid is None or pgid == os.getpgrp():  # never signal our own group
+                    process.kill()
+                else:
+                    os.killpg(pgid, signal.SIGTERM)
+                    time.sleep(1)
+                    if process.poll() is None:
+                        os.killpg(pgid, signal.SIGKILL)
+            except ProcessLookupError:
+                pass
+            process.wait()
+    except Exception as e:
+        print(f"Warning: {e}")
+    finally:
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+        time.sleep(2)