From 2260af405f7c9a37e51d96cfbe984e1b96fc601d Mon Sep 17 00:00:00 2001
From: starmountain1997 <77533802+starmountain1997@users.noreply.github.com>
Date: Wed, 25 Feb 2026 14:43:51 +0800
Subject: [PATCH] [DOC] add request forwarding (#6780)

### What this PR does / why we need it?

- New section: "Request Forwarding" documentation in
docs/source/tutorials/models/DeepSeek-V3.2.md
- Environment fix: Changed VLLM_ASCEND_ENABLE_FLASHCOMM1 from 0 to 1 in
the DeepSeek-V3 configuration examples

### Does this PR introduce _any_ user-facing change?

Documentation update only - provides new configuration guidance for
request forwarding setups

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/9562912cead1f11e8540fb91306c5cbda66f0007

---------

Signed-off-by: guozr <guozr1997@hotmail.com>
Co-authored-by: guozr <guozr1997@hotmail.com>
---
 docs/source/tutorials/models/DeepSeek-V3.2.md | 35 +++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/docs/source/tutorials/models/DeepSeek-V3.2.md b/docs/source/tutorials/models/DeepSeek-V3.2.md
index dd20aa84..92aaedec 100644
--- a/docs/source/tutorials/models/DeepSeek-V3.2.md
+++ b/docs/source/tutorials/models/DeepSeek-V3.2.md
@@ -297,7 +297,7 @@ export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=200
 export VLLM_ASCEND_ENABLE_MLAPO=1
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-export VLLM_ASCEND_ENABLE_FLASHCOMM1=0
+export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
 export HCCL_CONNECT_TIMEOUT=120
 export HCCL_INTRA_PCIE_ENABLE=1
 export HCCL_INTRA_ROCE_ENABLE=0
@@ -350,7 +350,7 @@ export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=200
 export VLLM_ASCEND_ENABLE_MLAPO=1
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-export VLLM_ASCEND_ENABLE_FLASHCOMM1=0
+export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
 export HCCL_CONNECT_TIMEOUT=120
 export HCCL_INTRA_PCIE_ENABLE=1
 export HCCL_INTRA_ROCE_ENABLE=0
@@ -830,6 +830,37 @@ python launch_online_dp.py --dp-size 8 --tp-size 4 --dp-size-local 4 --dp-rank-s
 python launch_online_dp.py --dp-size 8 --tp-size 4 --dp-size-local 4 --dp-rank-start 4 --dp-address 141.61.39.117 --dp-rpc-port 12777 --vllm-start-port 9100
 ```
 
+### Request Forwarding
+
+To set up request forwarding, run the following script on any machine :download:`load_balance_proxy_server_example.py <examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py>`
+
+```shell
+unset http_proxy
+unset https_proxy
+
+python load_balance_proxy_server_example.py \
+    --port 8000 \
+    --host 0.0.0.0 \
+    --prefiller-hosts \
+       141.61.39.105 \
+       141.61.39.113 \
+    --prefiller-ports \
+       9100 \
+       9100 \
+    --decoder-hosts \
+      141.61.39.117 \
+      141.61.39.117 \
+      141.61.39.117 \
+      141.61.39.117 \
+      141.61.39.181 \
+      141.61.39.181 \
+      141.61.39.181 \
+      141.61.39.181 \
+    --decoder-ports \
+      9100 9101 9102 9103 \
+      9100 9101 9102 9103 \
+```
+
 ## Functional Verification
 
 Once your server is started, you can query the model with input prompts: