Docs fix about EAGLE and streaming output (#3166)
Co-authored-by: Chayenne <zhaochenyang@ucla.edu> Co-authored-by: Chayenne <zhaochen20@outlook.com> Co-authored-by: Jhin <jhinpan@umich.edu>
This commit is contained in:
@@ -507,7 +507,15 @@
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -37,7 +37,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# launch the offline engine\n",
|
||||
"\n",
|
||||
"from sglang.utils import stream_and_merge, async_stream_and_merge\n",
|
||||
"import sglang as sgl\n",
|
||||
"import asyncio\n",
|
||||
"\n",
|
||||
@@ -86,20 +86,22 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prompts = [\n",
|
||||
" \"Hello, my name is\",\n",
|
||||
" \"The capital of France is\",\n",
|
||||
" \"The future of AI is\",\n",
|
||||
" \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
|
||||
" \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
|
||||
" \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
|
||||
"]\n",
|
||||
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
|
||||
"\n",
|
||||
"print(\"\\n=== Testing synchronous streaming generation ===\")\n",
|
||||
"sampling_params = {\n",
|
||||
" \"temperature\": 0.2,\n",
|
||||
" \"top_p\": 0.9,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"print(\"\\n=== Testing synchronous streaming generation with overlap removal ===\\n\")\n",
|
||||
"\n",
|
||||
"for prompt in prompts:\n",
|
||||
" print(f\"\\nPrompt: {prompt}\")\n",
|
||||
" print(\"Generated text: \", end=\"\", flush=True)\n",
|
||||
"\n",
|
||||
" for chunk in llm.generate(prompt, sampling_params, stream=True):\n",
|
||||
" print(chunk[\"text\"], end=\"\", flush=True)\n",
|
||||
" print(f\"Prompt: {prompt}\")\n",
|
||||
" merged_output = stream_and_merge(llm, prompt, sampling_params)\n",
|
||||
" print(\"Generated text:\", merged_output)\n",
|
||||
" print()"
|
||||
]
|
||||
},
|
||||
@@ -117,9 +119,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prompts = [\n",
|
||||
" \"Hello, my name is\",\n",
|
||||
" \"The capital of France is\",\n",
|
||||
" \"The future of AI is\",\n",
|
||||
" \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
|
||||
" \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
|
||||
" \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
|
||||
@@ -152,13 +154,14 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prompts = [\n",
|
||||
" \"Hello, my name is\",\n",
|
||||
" \"The capital of France is\",\n",
|
||||
" \"The future of AI is\",\n",
|
||||
" \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
|
||||
" \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
|
||||
" \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
|
||||
"\n",
|
||||
"print(\"\\n=== Testing asynchronous streaming generation ===\")\n",
|
||||
"print(\"\\n=== Testing asynchronous streaming generation (no repeats) ===\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"async def main():\n",
|
||||
@@ -166,10 +169,11 @@
|
||||
" print(f\"\\nPrompt: {prompt}\")\n",
|
||||
" print(\"Generated text: \", end=\"\", flush=True)\n",
|
||||
"\n",
|
||||
" generator = await llm.async_generate(prompt, sampling_params, stream=True)\n",
|
||||
" async for chunk in generator:\n",
|
||||
" print(chunk[\"text\"], end=\"\", flush=True)\n",
|
||||
" print()\n",
|
||||
" # Replace direct calls to async_generate with our custom overlap-aware version\n",
|
||||
" async for cleaned_chunk in async_stream_and_merge(llm, prompt, sampling_params):\n",
|
||||
" print(cleaned_chunk, end=\"\", flush=True)\n",
|
||||
"\n",
|
||||
" print() # New line after each prompt\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"asyncio.run(main())"
|
||||
|
||||
@@ -8,12 +8,17 @@
|
||||
"\n",
|
||||
"SGLang now provides an EAGLE-based speculative decoding option. The implementation aims to maximize speed and efficiency and is considered to be among the fastest in open-source LLM engines.\n",
|
||||
"\n",
|
||||
"To run the following tests or benchmarks, you also need to install [**cutex**](https://pypi.org/project/cutex/): \n",
|
||||
"> ```bash\n",
|
||||
"> pip install cutex\n",
|
||||
"> ```\n",
|
||||
"\n",
|
||||
"### Performance Highlights\n",
|
||||
"\n",
|
||||
"- **Official EAGLE code** ([SafeAILab/EAGLE](https://github.com/SafeAILab/EAGLE)): ~200 tokens/s\n",
|
||||
"- **Standard SGLang Decoding**: ~156 tokens/s\n",
|
||||
"- **EAGLE Decoding in SGLang**: ~297 tokens/s\n",
|
||||
"- **EAGLE Decoding in SGLang (w/ `torch.compile`)**: ~316 tokens/s\n",
|
||||
"- Official EAGLE code ([SafeAILab/EAGLE](https://github.com/SafeAILab/EAGLE)): ~200 tokens/s\n",
|
||||
"- Standard SGLang Decoding: ~156 tokens/s\n",
|
||||
"- EAGLE Decoding in SGLang: ~297 tokens/s\n",
|
||||
"- EAGLE Decoding in SGLang (w/ `torch.compile`): ~316 tokens/s\n",
|
||||
"\n",
|
||||
"All benchmarks below were run on a single H100."
|
||||
]
|
||||
|
||||
@@ -5,6 +5,7 @@ You can install SGLang using any of the methods below.
|
||||
## Method 1: With pip
|
||||
```
|
||||
pip install --upgrade pip
|
||||
pip install sgl-kernel --force-reinstall --no-deps
|
||||
pip install "sglang[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
|
||||
```
|
||||
|
||||
@@ -17,10 +18,11 @@ git clone -b v0.4.2 https://github.com/sgl-project/sglang.git
|
||||
cd sglang
|
||||
|
||||
pip install --upgrade pip
|
||||
pip install sgl-kernel --force-reinstall --no-deps
|
||||
pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
|
||||
```
|
||||
|
||||
Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
|
||||
Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions. If you encounter an issue like **ImportError: cannot import name `_grouped_size_compiled_for_decode_kernels`**, installing an older version of FlashInfer, such as 0.1.6, instead of the latest version may resolve it.
|
||||
|
||||
Note: For AMD ROCm systems with Instinct/MI GPUs, do the following instead:
|
||||
|
||||
@@ -30,6 +32,7 @@ git clone -b v0.4.2 https://github.com/sgl-project/sglang.git
|
||||
cd sglang
|
||||
|
||||
pip install --upgrade pip
|
||||
pip install sgl-kernel --force-reinstall --no-deps
|
||||
pip install -e "python[all_hip]"
|
||||
```
|
||||
|
||||
|
||||
Reference in New Issue
Block a user