Docs fix about EAGLE and streaming output (#3166)

Co-authored-by: Chayenne <zhaochenyang@ucla.edu>
Co-authored-by: Chayenne <zhaochen20@outlook.com>
Co-authored-by: Jhin <jhinpan@umich.edu>
This commit is contained in:
Jhin
2025-01-27 20:10:45 -06:00
committed by GitHub
parent 08104b56de
commit 7b9b4f4426
6 changed files with 91 additions and 29 deletions

View File

@@ -37,7 +37,7 @@
"outputs": [],
"source": [
"# launch the offline engine\n",
"\n",
"from sglang.utils import stream_and_merge, async_stream_and_merge\n",
"import sglang as sgl\n",
"import asyncio\n",
"\n",
@@ -86,20 +86,22 @@
"outputs": [],
"source": [
"prompts = [\n",
" \"Hello, my name is\",\n",
" \"The capital of France is\",\n",
" \"The future of AI is\",\n",
" \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
"    \"Provide a concise factual statement about France's capital city. The capital of France is\",\n",
" \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
"]\n",
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
"\n",
"print(\"\\n=== Testing synchronous streaming generation ===\")\n",
"sampling_params = {\n",
" \"temperature\": 0.2,\n",
" \"top_p\": 0.9,\n",
"}\n",
"\n",
"print(\"\\n=== Testing synchronous streaming generation with overlap removal ===\\n\")\n",
"\n",
"for prompt in prompts:\n",
" print(f\"\\nPrompt: {prompt}\")\n",
" print(\"Generated text: \", end=\"\", flush=True)\n",
"\n",
" for chunk in llm.generate(prompt, sampling_params, stream=True):\n",
" print(chunk[\"text\"], end=\"\", flush=True)\n",
" print(f\"Prompt: {prompt}\")\n",
" merged_output = stream_and_merge(llm, prompt, sampling_params)\n",
" print(\"Generated text:\", merged_output)\n",
" print()"
]
},
@@ -117,9 +119,9 @@
"outputs": [],
"source": [
"prompts = [\n",
" \"Hello, my name is\",\n",
" \"The capital of France is\",\n",
" \"The future of AI is\",\n",
" \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
"    \"Provide a concise factual statement about France's capital city. The capital of France is\",\n",
" \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
"]\n",
"\n",
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
@@ -152,13 +154,14 @@
"outputs": [],
"source": [
"prompts = [\n",
" \"Hello, my name is\",\n",
" \"The capital of France is\",\n",
" \"The future of AI is\",\n",
" \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
"    \"Provide a concise factual statement about France's capital city. The capital of France is\",\n",
" \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
"]\n",
"\n",
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
"\n",
"print(\"\\n=== Testing asynchronous streaming generation ===\")\n",
"print(\"\\n=== Testing asynchronous streaming generation (no repeats) ===\")\n",
"\n",
"\n",
"async def main():\n",
@@ -166,10 +169,11 @@
" print(f\"\\nPrompt: {prompt}\")\n",
" print(\"Generated text: \", end=\"\", flush=True)\n",
"\n",
" generator = await llm.async_generate(prompt, sampling_params, stream=True)\n",
" async for chunk in generator:\n",
" print(chunk[\"text\"], end=\"\", flush=True)\n",
" print()\n",
" # Replace direct calls to async_generate with our custom overlap-aware version\n",
" async for cleaned_chunk in async_stream_and_merge(llm, prompt, sampling_params):\n",
" print(cleaned_chunk, end=\"\", flush=True)\n",
"\n",
" print() # New line after each prompt\n",
"\n",
"\n",
"asyncio.run(main())"