Fix formatting in long code blocks (#10528)
This commit is contained in:
committed by
GitHub
parent
0abb41c70d
commit
7f028b07c4
@@ -29,51 +29,93 @@ The "❌" and "✅" symbols in the table above under "Page Size > 1" indicate wh
|
||||
|
||||
- FlashInfer (Default for Non-Hopper Machines, e.g., A100, A40)

  ```bash
  python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend flashinfer
  python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-V3 --attention-backend flashinfer --trust-remote-code
  python3 -m sglang.launch_server \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --attention-backend flashinfer
  python3 -m sglang.launch_server \
    --tp 8 \
    --model deepseek-ai/DeepSeek-V3 \
    --attention-backend flashinfer \
    --trust-remote-code
  ```
|
||||
|
||||
- FlashAttention 3 (Default for Hopper Machines, e.g., H100, H200, H20)

  ```bash
  python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend fa3
  python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-V3 --trust-remote-code --attention-backend fa3
  python3 -m sglang.launch_server \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --attention-backend fa3
  python3 -m sglang.launch_server \
    --tp 8 \
    --model deepseek-ai/DeepSeek-V3 \
    --trust-remote-code \
    --attention-backend fa3
  ```
|
||||
|
||||
- Triton

  ```bash
  python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend triton
  python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-V3 --attention-backend triton --trust-remote-code
  python3 -m sglang.launch_server \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --attention-backend triton
  python3 -m sglang.launch_server \
    --tp 8 \
    --model deepseek-ai/DeepSeek-V3 \
    --attention-backend triton \
    --trust-remote-code
  ```
|
||||
|
||||
- Torch Native

  ```bash
  python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend torch_native
  python3 -m sglang.launch_server \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --attention-backend torch_native
  ```
|
||||
|
||||
- FlashMLA

  ```bash
  python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-R1 --attention-backend flashmla --trust-remote-code
  python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-R1 --attention-backend flashmla --kv-cache-dtype fp8_e4m3 --trust-remote-code
  python3 -m sglang.launch_server \
    --tp 8 \
    --model deepseek-ai/DeepSeek-R1 \
    --attention-backend flashmla \
    --trust-remote-code
  python3 -m sglang.launch_server \
    --tp 8 \
    --model deepseek-ai/DeepSeek-R1 \
    --attention-backend flashmla \
    --kv-cache-dtype fp8_e4m3 \
    --trust-remote-code
  ```
|
||||
|
||||
- TRTLLM MLA (Optimized for Blackwell Architecture, e.g., B200)

  ```bash
  python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-R1 --attention-backend trtllm_mla --trust-remote-code
  python3 -m sglang.launch_server \
    --tp 8 \
    --model deepseek-ai/DeepSeek-R1 \
    --attention-backend trtllm_mla \
    --trust-remote-code
  ```
|
||||
|
||||
- TRTLLM MLA with FP8 KV Cache (Higher concurrency, lower memory footprint)

  ```bash
  python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-R1 --attention-backend trtllm_mla --kv-cache-dtype fp8_e4m3 --trust-remote-code
  python3 -m sglang.launch_server \
    --tp 8 \
    --model deepseek-ai/DeepSeek-R1 \
    --attention-backend trtllm_mla \
    --kv-cache-dtype fp8_e4m3 \
    --trust-remote-code
  ```
|
||||
|
||||
- Ascend

  ```bash
  python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend ascend
  python3 -m sglang.launch_server \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --attention-backend ascend
  ```
|
||||
|
||||
- Wave

  ```bash
  python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend wave
  python3 -m sglang.launch_server \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --attention-backend wave
  ```
|
||||
|
||||
## Steps to add a new attention backend
|
||||
|
||||
@@ -34,22 +34,88 @@ uv pip install mooncake-transfer-engine
|
||||
### Llama Single Node

```bash
$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode prefill --disaggregation-ib-device mlx5_roce0
$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode decode --port 30001 --base-gpu-id 1 --disaggregation-ib-device mlx5_roce0
$ python -m sglang_router.launch_router --pd-disaggregation --prefill http://127.0.0.1:30000 --decode http://127.0.0.1:30001 --host 0.0.0.0 --port 8000
python -m sglang.launch_server \
  --model-path meta-llama/Llama-3.1-8B-Instruct \
  --disaggregation-mode prefill \
  --disaggregation-ib-device mlx5_roce0
python -m sglang.launch_server \
  --model-path meta-llama/Llama-3.1-8B-Instruct \
  --disaggregation-mode decode \
  --port 30001 \
  --base-gpu-id 1 \
  --disaggregation-ib-device mlx5_roce0
python -m sglang_router.launch_router --pd-disaggregation --prefill http://127.0.0.1:30000 --decode http://127.0.0.1:30001 --host 0.0.0.0 --port 8000
```
|
||||
|
||||
### DeepSeek Multi-Node

```bash
# prefill 0
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8
python -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3-0324 \
  --disaggregation-ib-device ${device_name} \
  --disaggregation-mode prefill \
  --host ${local_ip} \
  --port 30000 \
  --trust-remote-code \
  --dist-init-addr ${prefill_master_ip}:5000 \
  --nnodes 2 \
  --node-rank 0 \
  --tp-size 16 \
  --dp-size 8 \
  --enable-dp-attention \
  --moe-a2a-backend deepep \
  --mem-fraction-static 0.8
# prefill 1
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8
python -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3-0324 \
  --disaggregation-ib-device ${device_name} \
  --disaggregation-mode prefill \
  --host ${local_ip} \
  --port 30000 \
  --trust-remote-code \
  --dist-init-addr ${prefill_master_ip}:5000 \
  --nnodes 2 \
  --node-rank 1 \
  --tp-size 16 \
  --dp-size 8 \
  --enable-dp-attention \
  --moe-a2a-backend deepep \
  --mem-fraction-static 0.8
# decode 0
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8 --max-running-requests 128
python -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3-0324 \
  --disaggregation-ib-device ${device_name} \
  --disaggregation-mode decode \
  --host ${local_ip} \
  --port 30001 \
  --trust-remote-code \
  --dist-init-addr ${decode_master_ip}:5000 \
  --nnodes 2 \
  --node-rank 0 \
  --tp-size 16 \
  --dp-size 8 \
  --enable-dp-attention \
  --moe-a2a-backend deepep \
  --mem-fraction-static 0.8 \
  --max-running-requests 128
# decode 1
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8 --max-running-requests 128
python -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3-0324 \
  --disaggregation-ib-device ${device_name} \
  --disaggregation-mode decode \
  --host ${local_ip} \
  --port 30001 \
  --trust-remote-code \
  --dist-init-addr ${decode_master_ip}:5000 \
  --nnodes 2 \
  --node-rank 1 \
  --tp-size 16 \
  --dp-size 8 \
  --enable-dp-attention \
  --moe-a2a-backend deepep \
  --mem-fraction-static 0.8 \
  --max-running-requests 128
```
|
||||
### Advanced Configuration
|
||||
|
||||
@@ -98,22 +164,88 @@ pip install . --config-settings=setup-args="-Ducx_path=/path/to/ucx"
|
||||
### Llama Single Node

```bash
$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode prefill --disaggregation-transfer-backend nixl
$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode decode --port 30001 --base-gpu-id 1 --disaggregation-transfer-backend nixl
$ python -m sglang_router.launch_router --pd-disaggregation --prefill http://127.0.0.1:30000 --decode http://127.0.0.1:30001 --host 0.0.0.0 --port 8000
python -m sglang.launch_server \
  --model-path meta-llama/Llama-3.1-8B-Instruct \
  --disaggregation-mode prefill \
  --disaggregation-transfer-backend nixl
python -m sglang.launch_server \
  --model-path meta-llama/Llama-3.1-8B-Instruct \
  --disaggregation-mode decode \
  --port 30001 \
  --base-gpu-id 1 \
  --disaggregation-transfer-backend nixl
python -m sglang_router.launch_router --pd-disaggregation --prefill http://127.0.0.1:30000 --decode http://127.0.0.1:30001 --host 0.0.0.0 --port 8000
```
|
||||
|
||||
### DeepSeek Multi-Node

```bash
# prefill 0
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend nixl --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8
python -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3-0324 \
  --disaggregation-transfer-backend nixl \
  --disaggregation-mode prefill \
  --host ${local_ip} \
  --port 30000 \
  --trust-remote-code \
  --dist-init-addr ${prefill_master_ip}:5000 \
  --nnodes 2 \
  --node-rank 0 \
  --tp-size 16 \
  --dp-size 8 \
  --enable-dp-attention \
  --moe-a2a-backend deepep \
  --mem-fraction-static 0.8
# prefill 1
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend nixl --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8
python -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3-0324 \
  --disaggregation-transfer-backend nixl \
  --disaggregation-mode prefill \
  --host ${local_ip} \
  --port 30000 \
  --trust-remote-code \
  --dist-init-addr ${prefill_master_ip}:5000 \
  --nnodes 2 \
  --node-rank 1 \
  --tp-size 16 \
  --dp-size 8 \
  --enable-dp-attention \
  --moe-a2a-backend deepep \
  --mem-fraction-static 0.8
# decode 0
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend nixl --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8 --max-running-requests 128
python -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3-0324 \
  --disaggregation-transfer-backend nixl \
  --disaggregation-mode decode \
  --host ${local_ip} \
  --port 30001 \
  --trust-remote-code \
  --dist-init-addr ${decode_master_ip}:5000 \
  --nnodes 2 \
  --node-rank 0 \
  --tp-size 16 \
  --dp-size 8 \
  --enable-dp-attention \
  --moe-a2a-backend deepep \
  --mem-fraction-static 0.8 \
  --max-running-requests 128
# decode 1
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend nixl --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8 --max-running-requests 128
python -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3-0324 \
  --disaggregation-transfer-backend nixl \
  --disaggregation-mode decode \
  --host ${local_ip} \
  --port 30001 \
  --trust-remote-code \
  --dist-init-addr ${decode_master_ip}:5000 \
  --nnodes 2 \
  --node-rank 1 \
  --tp-size 16 \
  --dp-size 8 \
  --enable-dp-attention \
  --moe-a2a-backend deepep \
  --mem-fraction-static 0.8 \
  --max-running-requests 128
```
|
||||
|
||||
## ASCEND
|
||||
@@ -135,16 +267,44 @@ export ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE=true
|
||||
### Llama Single Node

```bash
$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode prefill --disaggregation-transfer-backend ascend
$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode decode --port 30001 --base-gpu-id 1 --disaggregation-transfer-backend ascend
$ python -m sglang_router.launch_router --pd-disaggregation --prefill http://127.0.0.1:30000 --decode http://127.0.0.1:30001 --host 0.0.0.0 --port 8000
python -m sglang.launch_server \
  --model-path meta-llama/Llama-3.1-8B-Instruct \
  --disaggregation-mode prefill \
  --disaggregation-transfer-backend ascend
python -m sglang.launch_server \
  --model-path meta-llama/Llama-3.1-8B-Instruct \
  --disaggregation-mode decode \
  --port 30001 \
  --base-gpu-id 1 \
  --disaggregation-transfer-backend ascend
python -m sglang_router.launch_router --pd-disaggregation --prefill http://127.0.0.1:30000 --decode http://127.0.0.1:30001 --host 0.0.0.0 --port 8000
```
|
||||
|
||||
### DeepSeek Multi-Node

```bash
# prefill 0
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend ascend --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 1 --node-rank 0 --tp-size 16
python -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3-0324 \
  --disaggregation-transfer-backend ascend \
  --disaggregation-mode prefill \
  --host ${local_ip} \
  --port 30000 \
  --trust-remote-code \
  --dist-init-addr ${prefill_master_ip}:5000 \
  --nnodes 1 \
  --node-rank 0 \
  --tp-size 16
# decode 0
$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend ascend --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 1 --node-rank 0 --tp-size 16
python -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3-0324 \
  --disaggregation-transfer-backend ascend \
  --disaggregation-mode decode \
  --host ${local_ip} \
  --port 30001 \
  --trust-remote-code \
  --dist-init-addr ${decode_master_ip}:5000 \
  --nnodes 1 \
  --node-rank 0 \
  --tp-size 16
```
|
||||
|
||||
@@ -43,10 +43,20 @@ You can find all arguments by `python3 -m sglang.launch_server --help`
|
||||
|
||||
```bash
# Node 0
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --dist-init-addr sgl-dev-0:50000 --nnodes 2 --node-rank 0
python -m sglang.launch_server \
  --model-path meta-llama/Meta-Llama-3-8B-Instruct \
  --tp 4 \
  --dist-init-addr sgl-dev-0:50000 \
  --nnodes 2 \
  --node-rank 0

# Node 1
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --dist-init-addr sgl-dev-0:50000 --nnodes 2 --node-rank 1
python -m sglang.launch_server \
  --model-path meta-llama/Meta-Llama-3-8B-Instruct \
  --tp 4 \
  --dist-init-addr sgl-dev-0:50000 \
  --nnodes 2 \
  --node-rank 1
```
|
||||
|
||||
Please consult the documentation below and [server_args.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/server_args.py) to learn more about the arguments you may provide when launching a server.
|
||||
|
||||
Reference in New Issue
Block a user