From 3efc8e2d2aebeb5a84eea070aa5a18a18f542693 Mon Sep 17 00:00:00 2001 From: mRSun15 <3150105645@zju.edu.cn> Date: Tue, 15 Apr 2025 17:16:34 -0700 Subject: [PATCH] add attention backend supporting matrix in the doc (#5211) Co-authored-by: Stefan He --- docs/backend/attention_backend.md | 39 +++++++++++++++++++++++++++++++ docs/index.rst | 1 + 2 files changed, 40 insertions(+) create mode 100644 docs/backend/attention_backend.md diff --git a/docs/backend/attention_backend.md b/docs/backend/attention_backend.md new file mode 100644 index 000000000..6647e3125 --- /dev/null +++ b/docs/backend/attention_backend.md @@ -0,0 +1,39 @@ +# Attention Backend + +## Support matrix for different attention backends + +| **Backend** | **Page Size > 1** | **Spec Decoding** | **MLA** | **Sliding Window** | **MultiModal** | +|--------------------------|-------------------|-------------------|--------|--------------------|------------| +| **FlashInfer** | ✅ | ✅ | ✅ | ✅ | ✅ | +| **FA3** | ✅ | ✅ | ✅ | ✅ | ✅ | +| **Triton** | ❌ | ✅ | ✅ | ❌ | ❌ | +| **Torch Native** | ❌ | ❌ | ❌ | ❌ | ❌ | + + +## User guide + +#### Launch command for different attention backends
+ +- FlashInfer (Default for Non-Hopper Machines, e.g., A100, A40) +```bash +python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend flashinfer +python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-V3 --attention-backend flashinfer --trust-remote-code +``` + +- FlashAttention 3 (Default for Hopper Machines, e.g., H100, H200, H20) +```bash +python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend fa3 +python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-V3 --trust-remote-code --attention-backend fa3 +``` + +- Triton +```bash +python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend triton +python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-V3 --attention-backend triton --trust-remote-code + +``` + +- Torch Native +```bash +python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend torch_native +``` diff --git a/docs/index.rst b/docs/index.rst index c31f1abcb..e55ff2b86 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -32,6 +32,7 @@ The core features include: backend/sampling_params.md backend/hyperparameter_tuning.md backend/structured_outputs_for_reasoning_models.ipynb + backend/attention_backend.md .. toctree:: :maxdepth: 1