[P/D][main] Retire the llmdatadist connector and its related code and files. (#4780)
### What this PR does / why we need it?
As support for the mooncake connector is now available, the llmdatadist
connector is no longer being maintained, so the llmdatadist-related
files need to be retired.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
By CI
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
Signed-off-by: liziyu <liziyu16@huawei.com>
Co-authored-by: liziyu <liziyu16@huawei.com>
This commit is contained in:
@@ -139,7 +139,6 @@ jobs:
|
|||||||
--ignore tests/ut/model_loader/netloader/test_netloader_elastic.py \
|
--ignore tests/ut/model_loader/netloader/test_netloader_elastic.py \
|
||||||
--ignore tests/ut/kv_connector/test_remote_prefill_lifecycle.py \
|
--ignore tests/ut/kv_connector/test_remote_prefill_lifecycle.py \
|
||||||
--ignore tests/ut/kv_connector/test_remote_decode_lifecycle.py \
|
--ignore tests/ut/kv_connector/test_remote_decode_lifecycle.py \
|
||||||
--ignore tests/ut/kv_connector/test_llmdatadist_connector.py \
|
|
||||||
--ignore tests/ut/core/test_scheduler_dynamic_batch.py
|
--ignore tests/ut/core/test_scheduler_dynamic_batch.py
|
||||||
|
|
||||||
- name: Upload coverage to Codecov
|
- name: Upload coverage to Codecov
|
||||||
|
|||||||
@@ -104,7 +104,7 @@ vllm-ascend is a hardware plugin for vLLM. Basically, the version of vllm-ascend
|
|||||||
|
|
||||||
### 8. Does vllm-ascend support Prefill Disaggregation feature?
|
### 8. Does vllm-ascend support Prefill Disaggregation feature?
|
||||||
|
|
||||||
Yes, vllm-ascend supports Prefill Disaggregation feature with LLMdatadist, Mooncake backend. Take [official tutorial](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_node_pd_disaggregation_llmdatadist.html) for example.
|
Yes, vllm-ascend supports the Prefill Disaggregation feature with the Mooncake backend. See the [official tutorial](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_node_pd_disaggregation_mooncake.html) for an example.
|
||||||
|
|
||||||
### 9. Does vllm-ascend support quantization method?
|
### 9. Does vllm-ascend support quantization method?
|
||||||
|
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ single_npu_qwen2_audio
|
|||||||
single_npu_qwen3_embedding
|
single_npu_qwen3_embedding
|
||||||
single_npu_qwen3_quantization
|
single_npu_qwen3_quantization
|
||||||
single_npu_qwen3_w4a4
|
single_npu_qwen3_w4a4
|
||||||
single_node_pd_disaggregation_llmdatadist
|
single_node_pd_disaggregation_mooncake
|
||||||
multi_npu_qwen3_next
|
multi_npu_qwen3_next
|
||||||
multi_npu
|
multi_npu
|
||||||
multi_npu_moge
|
multi_npu_moge
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# Prefill-Decode Disaggregation Llmdatadist Verification (Qwen2.5-VL)
|
# Prefill-Decode Disaggregation Mooncake Verification (Qwen2.5-VL)
|
||||||
|
|
||||||
## Getting Started
|
## Getting Started
|
||||||
|
|
||||||
@@ -69,10 +69,8 @@ export HCCL_IF_IP=192.0.0.1 # node ip
|
|||||||
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
||||||
export TP_SOCKET_IFNAME="eth0"
|
export TP_SOCKET_IFNAME="eth0"
|
||||||
export HCCL_SOCKET_IFNAME="eth0"
|
export HCCL_SOCKET_IFNAME="eth0"
|
||||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
|
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5959
|
|
||||||
|
|
||||||
vllm serve /model/Qwen2.5-VL-7B-Instruct \
|
vllm serve /model/Qwen2.5-VL-7B-Instruct \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
@@ -85,14 +83,22 @@ vllm serve /model/Qwen2.5-VL-7B-Instruct \
|
|||||||
--max-num-batched-tokens 40000 \
|
--max-num-batched-tokens 40000 \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--gpu-memory-utilization 0.9 \
|
--gpu-memory-utilization 0.9 \
|
||||||
--kv-transfer-config \
|
--kv-transfer-config \
|
||||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
'{"kv_connector": "MooncakeConnector",
|
||||||
"kv_buffer_device": "npu",
|
"kv_role": "kv_producer",
|
||||||
"kv_role": "kv_producer",
|
"kv_port": "30000",
|
||||||
"kv_parallel_size": 1,
|
"engine_id": "0",
|
||||||
"kv_port": "20001",
|
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||||
"engine_id": "0",
|
"kv_connector_extra_config": {
|
||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"prefill": {
|
||||||
|
"dp_size": 1,
|
||||||
|
"tp_size": 1
|
||||||
|
},
|
||||||
|
"decode": {
|
||||||
|
"dp_size": 1,
|
||||||
|
"tp_size": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -106,10 +112,8 @@ export HCCL_IF_IP=192.0.0.1 # node ip
|
|||||||
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
||||||
export TP_SOCKET_IFNAME="eth0"
|
export TP_SOCKET_IFNAME="eth0"
|
||||||
export HCCL_SOCKET_IFNAME="eth0"
|
export HCCL_SOCKET_IFNAME="eth0"
|
||||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
|
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5979
|
|
||||||
|
|
||||||
vllm serve /model/Qwen2.5-VL-7B-Instruct \
|
vllm serve /model/Qwen2.5-VL-7B-Instruct \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
@@ -122,14 +126,22 @@ vllm serve /model/Qwen2.5-VL-7B-Instruct \
|
|||||||
--max-num-batched-tokens 40000 \
|
--max-num-batched-tokens 40000 \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--gpu-memory-utilization 0.9 \
|
--gpu-memory-utilization 0.9 \
|
||||||
--kv-transfer-config \
|
--kv-transfer-config \
|
||||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
'{"kv_connector": "MooncakeConnector",
|
||||||
"kv_buffer_device": "npu",
|
|
||||||
"kv_role": "kv_consumer",
|
"kv_role": "kv_consumer",
|
||||||
"kv_parallel_size": 1,
|
"kv_port": "30100",
|
||||||
"kv_port": "20001",
|
"engine_id": "1",
|
||||||
"engine_id": "0",
|
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"kv_connector_extra_config": {
|
||||||
|
"prefill": {
|
||||||
|
"dp_size": 1,
|
||||||
|
"tp_size": 1
|
||||||
|
},
|
||||||
|
"decode": {
|
||||||
|
"dp_size": 1,
|
||||||
|
"tp_size": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -137,7 +149,7 @@ vllm serve /model/Qwen2.5-VL-7B-Instruct \
|
|||||||
|
|
||||||
:::::
|
:::::
|
||||||
|
|
||||||
If you want to run "2P1D", please set ASCEND_RT_VISIBLE_DEVICES, VLLM_ASCEND_LLMDD_RPC_PORT and port to different values for each P process.
|
If you want to run "2P1D", please set ASCEND_RT_VISIBLE_DEVICES and port to different values for each P process.
|
||||||
|
|
||||||
## Example Proxy for Deployment
|
## Example Proxy for Deployment
|
||||||
|
|
||||||
@@ -1,238 +0,0 @@
|
|||||||
# Disaggregated Prefill-Decode Deployment Guide
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
This demo document provides instructions for running a disaggregated vLLM-ascend service with separate prefill and decode stages across 4 nodes, using 16 Ascend NPUs for the two prefill nodes (P1/P2) and 16 Ascend NPUs for the two decode nodes (D1/D2).
|
|
||||||
|
|
||||||
## Prerequisites
|
|
||||||
- Ascend NPU environment with vLLM 0.9.1 installed
|
|
||||||
- Network interfaces configured for distributed communication (eg: eth0)
|
|
||||||
- Model weights located at `/models/deepseek_r1_w8a8`
|
|
||||||
|
|
||||||
## Rank table generation
|
|
||||||
The rank table is a JSON file that specifies the mapping of Ascend NPU ranks to nodes. The following command generates a rank table for all nodes with 16 cards prefill and 16 cards decode:
|
|
||||||
|
|
||||||
Run the following command on every node to generate the rank table:
|
|
||||||
```shell
|
|
||||||
cd /vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/
|
|
||||||
bash gen_ranktable.sh --ips 172.19.32.175 172.19.241.49 172.19.123.51 172.19.190.36 \
|
|
||||||
--npus-per-node 8 --network-card-name eth0 --prefill-device-cnt 16 --decode-device-cnt 16
|
|
||||||
```
|
|
||||||
The rank table will be generated at `/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json`
|
|
||||||
|
|
||||||
## Start disaggregated vLLM-ascend service
|
|
||||||
For demonstration purposes, we will utilize the quantized version of Deepseek-R1. Recommended Parallelization Strategies:
|
|
||||||
- P-node: DP2-TP8-EP16 (Data Parallelism 2, Tensor Parallelism 8, Expert Parallelism 16)
|
|
||||||
- D-node: DP4-TP4-EP16 (Data Parallelism 4, Tensor Parallelism 4, Expert Parallelism 16)
|
|
||||||
|
|
||||||
Execution Sequence
|
|
||||||
- The 4 configured node IPs are: 172.19.32.175 172.19.241.49 172.19.123.51 172.19.190.36
|
|
||||||
- Start Prefill on Node 1 (P1)
|
|
||||||
- Start Prefill on Node 2 (P2)
|
|
||||||
- Start Decode on Node 1 (D1)
|
|
||||||
- Start Decode on Node 2 (D2)
|
|
||||||
- Start proxy server on Node1
|
|
||||||
|
|
||||||
Run prefill server P1 on first node:
|
|
||||||
```shell
|
|
||||||
export HCCL_IF_IP=172.19.32.175 # node ip
|
|
||||||
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
|
||||||
export TP_SOCKET_IFNAME="eth0"
|
|
||||||
export HCCL_SOCKET_IFNAME="eth0"
|
|
||||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
|
|
||||||
export OMP_PROC_BIND=false
|
|
||||||
export OMP_NUM_THREADS=10
|
|
||||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5559
|
|
||||||
|
|
||||||
vllm serve /models/deepseek_r1_w8a8 \
|
|
||||||
--host 0.0.0.0 \
|
|
||||||
--port 20002 \
|
|
||||||
--data-parallel-size 2 \
|
|
||||||
--data-parallel-size-local 1 \
|
|
||||||
--api-server-count 2 \
|
|
||||||
--data-parallel-address 172.19.32.175 \
|
|
||||||
--data-parallel-rpc-port 13356 \
|
|
||||||
--tensor-parallel-size 8 \
|
|
||||||
--enable-expert-parallel \
|
|
||||||
--seed 1024 \
|
|
||||||
--served-model-name deepseek \
|
|
||||||
--max-model-len 32768 \
|
|
||||||
--max-num-batched-tokens 32768 \
|
|
||||||
--max-num-seqs 256 \
|
|
||||||
--trust-remote-code \
|
|
||||||
--enforce-eager \
|
|
||||||
--gpu-memory-utilization 0.9 \
|
|
||||||
--kv-transfer-config \
|
|
||||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
|
||||||
"kv_buffer_device": "npu",
|
|
||||||
"kv_role": "kv_producer",
|
|
||||||
"kv_parallel_size": 1,
|
|
||||||
"kv_port": "20001",
|
|
||||||
"engine_id": "0",
|
|
||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
Run prefill server P2 on second node:
|
|
||||||
```shell
|
|
||||||
export HCCL_IF_IP=172.19.241.49
|
|
||||||
export GLOO_SOCKET_IFNAME="eth0"
|
|
||||||
export TP_SOCKET_IFNAME="eth0"
|
|
||||||
export HCCL_SOCKET_IFNAME="eth0"
|
|
||||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
|
|
||||||
export OMP_PROC_BIND=false
|
|
||||||
export OMP_NUM_THREADS=10
|
|
||||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5659
|
|
||||||
|
|
||||||
vllm serve /models/deepseek_r1_w8a8 \
|
|
||||||
--host 0.0.0.0 \
|
|
||||||
--port 20002 \
|
|
||||||
--headless \
|
|
||||||
--data-parallel-size 2 \
|
|
||||||
--data-parallel-start-rank 1 \
|
|
||||||
--data-parallel-size-local 1 \
|
|
||||||
--data-parallel-address 172.19.32.175 \
|
|
||||||
--data-parallel-rpc-port 13356 \
|
|
||||||
--tensor-parallel-size 8 \
|
|
||||||
--enable-expert-parallel \
|
|
||||||
--seed 1024 \
|
|
||||||
--served-model-name deepseek \
|
|
||||||
--max-model-len 32768 \
|
|
||||||
--max-num-batched-tokens 32768 \
|
|
||||||
--max-num-seqs 256 \
|
|
||||||
--trust-remote-code \
|
|
||||||
--enforce-eager \
|
|
||||||
--gpu-memory-utilization 0.9 \
|
|
||||||
--kv-transfer-config \
|
|
||||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
|
||||||
"kv_buffer_device": "npu",
|
|
||||||
"kv_role": "kv_producer",
|
|
||||||
"kv_parallel_size": 1,
|
|
||||||
"kv_port": "20001",
|
|
||||||
"engine_id": "0",
|
|
||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
Run decode server d1 on third node:
|
|
||||||
|
|
||||||
* In the D node, the `max-num-batched-tokens` parameter can be set to a smaller value since the D node processes at most `max-num-seqs` batches concurrently. As the `profile_run` only needs to handle `max-num-seqs` sequences at a time, we can safely set `max-num-batched-tokens` equal to `max-num-seqs`. This optimization will help reduce activation memory consumption.
|
|
||||||
```shell
|
|
||||||
export HCCL_IF_IP=172.19.123.51
|
|
||||||
export GLOO_SOCKET_IFNAME="eth0"
|
|
||||||
export TP_SOCKET_IFNAME="eth0"
|
|
||||||
export HCCL_SOCKET_IFNAME="eth0"
|
|
||||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
|
|
||||||
export OMP_PROC_BIND=false
|
|
||||||
export OMP_NUM_THREADS=10
|
|
||||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5759
|
|
||||||
|
|
||||||
vllm serve /models/deepseek_r1_w8a8 \
|
|
||||||
--host 0.0.0.0 \
|
|
||||||
--port 20002 \
|
|
||||||
--data-parallel-size 4 \
|
|
||||||
--data-parallel-size-local 2 \
|
|
||||||
--api-server-count 2 \
|
|
||||||
--data-parallel-address 172.19.123.51 \
|
|
||||||
--data-parallel-rpc-port 13356 \
|
|
||||||
--tensor-parallel-size 4 \
|
|
||||||
--enable-expert-parallel \
|
|
||||||
--seed 1024 \
|
|
||||||
--served-model-name deepseek \
|
|
||||||
--max-model-len 32768 \
|
|
||||||
--max-num-batched-tokens 256 \
|
|
||||||
--max-num-seqs 256 \
|
|
||||||
--trust-remote-code \
|
|
||||||
--gpu-memory-utilization 0.9 \
|
|
||||||
--kv-transfer-config \
|
|
||||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
|
||||||
"kv_buffer_device": "npu",
|
|
||||||
"kv_role": "kv_consumer",
|
|
||||||
"kv_parallel_size": 1,
|
|
||||||
"kv_port": "20001",
|
|
||||||
"engine_id": "0",
|
|
||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
|
||||||
}' \
|
|
||||||
--additional-config \
|
|
||||||
'{"torchair_graph_config": {"enabled":true}}'
|
|
||||||
```
|
|
||||||
|
|
||||||
Run decode server d2 on last node:
|
|
||||||
```shell
|
|
||||||
export HCCL_IF_IP=172.19.190.36
|
|
||||||
export GLOO_SOCKET_IFNAME="eth0"
|
|
||||||
export TP_SOCKET_IFNAME="eth0"
|
|
||||||
export HCCL_SOCKET_IFNAME="eth0"
|
|
||||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
|
|
||||||
export OMP_PROC_BIND=false
|
|
||||||
export OMP_NUM_THREADS=10
|
|
||||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5859
|
|
||||||
|
|
||||||
vllm serve /models/deepseek_r1_w8a8 \
|
|
||||||
--host 0.0.0.0 \
|
|
||||||
--port 20002 \
|
|
||||||
--headless \
|
|
||||||
--data-parallel-size 4 \
|
|
||||||
--data-parallel-start-rank 2 \
|
|
||||||
--data-parallel-size-local 2 \
|
|
||||||
--data-parallel-address 172.19.123.51 \
|
|
||||||
--data-parallel-rpc-port 13356 \
|
|
||||||
--tensor-parallel-size 4 \
|
|
||||||
--enable-expert-parallel \
|
|
||||||
--seed 1024 \
|
|
||||||
--served-model-name deepseek \
|
|
||||||
--max-model-len 32768 \
|
|
||||||
--max-num-batched-tokens 256 \
|
|
||||||
--max-num-seqs 256 \
|
|
||||||
--trust-remote-code \
|
|
||||||
--gpu-memory-utilization 0.9 \
|
|
||||||
--kv-transfer-config \
|
|
||||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
|
||||||
"kv_buffer_device": "npu",
|
|
||||||
"kv_role": "kv_consumer",
|
|
||||||
"kv_parallel_size": 1,
|
|
||||||
"kv_port": "20001",
|
|
||||||
"engine_id": "0",
|
|
||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
|
||||||
}' \
|
|
||||||
--additional-config \
|
|
||||||
'{"torchair_graph_config": {"enabled":true}}'
|
|
||||||
```
|
|
||||||
|
|
||||||
Run proxy server on the first node:
|
|
||||||
```shell
|
|
||||||
cd /vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1
|
|
||||||
python load_balance_proxy_server_example.py --host 172.19.32.175 --port 1025 --prefiller-hosts 172.19.241.49 --prefiller-port 20002 --decoder-hosts 172.19.123.51 --decoder-ports 20002
|
|
||||||
```
|
|
||||||
|
|
||||||
Verification
|
|
||||||
Check service health using the proxy server endpoint:
|
|
||||||
```shell
|
|
||||||
curl http://localhost:1025/v1/completions \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"model": "deepseek",
|
|
||||||
"prompt": "Who are you?",
|
|
||||||
"max_tokens": 100,
|
|
||||||
"temperature": 0
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
Performance
|
|
||||||
Test performance with vllm benchmark:
|
|
||||||
```shell
|
|
||||||
cd /vllm-workspace/vllm/benchmarks
|
|
||||||
python3 benchmark_serving.py \
|
|
||||||
--backend vllm \
|
|
||||||
--dataset-name random \
|
|
||||||
--random-input-len 4096 \
|
|
||||||
--random-output-len 1536 \
|
|
||||||
--num-prompts 256 \
|
|
||||||
--ignore-eos \
|
|
||||||
--model deepseek \
|
|
||||||
--tokenizer /models/deepseek_r1_w8a8 \
|
|
||||||
--host localhost \
|
|
||||||
--port 1025 \
|
|
||||||
--endpoint /v1/completions \
|
|
||||||
--max-concurrency 4 \
|
|
||||||
--request-rate 4
|
|
||||||
```
|
|
||||||
@@ -1,144 +0,0 @@
|
|||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
|
|
||||||
import torch.distributed as dist
|
|
||||||
|
|
||||||
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="Arguments of rank table generator", )
|
|
||||||
parser.add_argument("--local-host", type=str, required=True, help="local ip")
|
|
||||||
parser.add_argument("--prefill-device-cnt",
|
|
||||||
type=int,
|
|
||||||
required=True,
|
|
||||||
help="number of prefill devices")
|
|
||||||
parser.add_argument("--decode-device-cnt",
|
|
||||||
type=int,
|
|
||||||
required=True,
|
|
||||||
help="number of decode devices")
|
|
||||||
parser.add_argument("--local-device-ids",
|
|
||||||
type=str,
|
|
||||||
required=False,
|
|
||||||
help="local device ids")
|
|
||||||
parser.add_argument("--ranktable-path",
|
|
||||||
type=str,
|
|
||||||
default="./ranktable.json",
|
|
||||||
help="output rank table path")
|
|
||||||
args = parser.parse_args()
|
|
||||||
local_host = args.local_host
|
|
||||||
prefill_device_cnt = args.prefill_device_cnt
|
|
||||||
decode_device_cnt = args.decode_device_cnt
|
|
||||||
|
|
||||||
print("enter py")
|
|
||||||
|
|
||||||
hccn_tool_path = os.environ.get("HCCN_TOOL_PATH",
|
|
||||||
"/usr/local/Ascend/driver/tools/hccn_tool")
|
|
||||||
master_addr = os.environ.get("MASTER_ADDR")
|
|
||||||
master_port = os.environ.get("MASTER_PORT")
|
|
||||||
rank = os.environ.get("RANK")
|
|
||||||
local_rank = os.environ.get("LOCAL_RANK")
|
|
||||||
# This variable is set by torchrun,
|
|
||||||
# and is different from WORLD_SIZE in gen_rank_table.sh.
|
|
||||||
world_size = os.environ.get("WORLD_SIZE")
|
|
||||||
|
|
||||||
device_type = get_ascend_device_type()
|
|
||||||
|
|
||||||
|
|
||||||
def get_cmd_stdout(cmd):
|
|
||||||
import subprocess
|
|
||||||
return subprocess.run(cmd, capture_output=True,
|
|
||||||
shell=True).stdout.decode("utf-8").strip()
|
|
||||||
|
|
||||||
|
|
||||||
print(f"local_host: {local_host}")
|
|
||||||
print("gen ranktable.json")
|
|
||||||
|
|
||||||
num_cards = get_cmd_stdout("npu-smi info -l | grep \"Total Count\"").split(
|
|
||||||
":")[1].strip()
|
|
||||||
num_cards = int(num_cards)
|
|
||||||
chips_per_card = get_cmd_stdout("npu-smi info -l | grep \"Chip Count\"").split(
|
|
||||||
"\n")[0].split(":")[1].strip()
|
|
||||||
chips_per_card = int(chips_per_card)
|
|
||||||
|
|
||||||
if args.local_device_ids:
|
|
||||||
try:
|
|
||||||
local_device_ids = [int(id_str) for id_str in args.local_device_ids.split(',')]
|
|
||||||
except ValueError:
|
|
||||||
print(f"Error: --local-device-ids must be a comma-separated list of integers. Received: '{args.local_device_ids}'")
|
|
||||||
exit(1)
|
|
||||||
else:
|
|
||||||
local_device_ids = []
|
|
||||||
for card_id in range(num_cards):
|
|
||||||
for chip_id in range(chips_per_card):
|
|
||||||
device_id = card_id * chips_per_card + chip_id
|
|
||||||
local_device_ids.append(device_id)
|
|
||||||
|
|
||||||
# generate local device list for local rank 0, and gather it to all ranks
|
|
||||||
local_device_list: list[dict[str, str]] = list()
|
|
||||||
if local_rank == "0":
|
|
||||||
super_pod_id = "0"
|
|
||||||
for idx in range(len(local_device_ids)):
|
|
||||||
device_id = local_device_ids[idx]
|
|
||||||
chip_id = device_id % chips_per_card
|
|
||||||
card_id = device_id // chips_per_card
|
|
||||||
if device_type == AscendDeviceType._910_93:
|
|
||||||
device_ip = get_cmd_stdout(
|
|
||||||
f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr"
|
|
||||||
).split(":")[1].strip()
|
|
||||||
super_device_id = get_cmd_stdout(
|
|
||||||
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep SDID"
|
|
||||||
).split(":")[1].strip()
|
|
||||||
super_pod_id = get_cmd_stdout(
|
|
||||||
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep \"Super Pod ID\""
|
|
||||||
).split(":")[1].strip()
|
|
||||||
else:
|
|
||||||
device_ip = get_cmd_stdout(
|
|
||||||
f"{hccn_tool_path} -i {device_id} -ip -g | grep ipaddr"
|
|
||||||
).split(":")[1].strip()
|
|
||||||
|
|
||||||
device_info = {
|
|
||||||
"server_id": local_host,
|
|
||||||
"device_id": str(device_id),
|
|
||||||
"device_ip": str(device_ip),
|
|
||||||
}
|
|
||||||
if device_type == AscendDeviceType._910_93:
|
|
||||||
device_info.update({
|
|
||||||
"super_pod_id": str(super_pod_id),
|
|
||||||
"super_device_id": str(super_device_id)
|
|
||||||
})
|
|
||||||
local_device_list.append(device_info)
|
|
||||||
|
|
||||||
dist.init_process_group(backend=dist.Backend.GLOO)
|
|
||||||
global_device_list = [None] * dist.get_world_size()
|
|
||||||
dist.all_gather_object(global_device_list, local_device_list)
|
|
||||||
global_device_list = [
|
|
||||||
device_info for device_list in global_device_list
|
|
||||||
for device_info in device_list # type: ignore[attr-defined]
|
|
||||||
]
|
|
||||||
cnt = 1
|
|
||||||
for device_info in global_device_list: # type: ignore[assignment]
|
|
||||||
device_info["cluster_id"] = str(cnt)
|
|
||||||
cnt += 1
|
|
||||||
assert (prefill_device_cnt + decode_device_cnt) <= len(global_device_list), \
|
|
||||||
"prefill_device_cnt + decode_device_cnt must be less than or equal to number of all devices in cluster"
|
|
||||||
ranktable = {
|
|
||||||
"version":
|
|
||||||
"1.2",
|
|
||||||
"server_count":
|
|
||||||
str(world_size),
|
|
||||||
"prefill_device_list":
|
|
||||||
global_device_list[:prefill_device_cnt],
|
|
||||||
"decode_device_list":
|
|
||||||
global_device_list[prefill_device_cnt:prefill_device_cnt +
|
|
||||||
decode_device_cnt],
|
|
||||||
"status":
|
|
||||||
"completed"
|
|
||||||
}
|
|
||||||
|
|
||||||
if local_rank == '0':
|
|
||||||
os.makedirs(os.path.dirname(args.ranktable_path), exist_ok=True)
|
|
||||||
with open(args.ranktable_path, "w") as f:
|
|
||||||
json.dump(ranktable, f, indent=4)
|
|
||||||
|
|
||||||
print("gen ranktable.json done")
|
|
||||||
@@ -1,89 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
|
||||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}
|
|
||||||
|
|
||||||
NPUS_PER_NODE=8
|
|
||||||
while [[ $# -gt 0 ]]; do
|
|
||||||
case "$1" in
|
|
||||||
--ips)
|
|
||||||
shift
|
|
||||||
while [[ $# -gt 0 && ! "$1" == --* ]]; do
|
|
||||||
IPs+=("$1")
|
|
||||||
shift
|
|
||||||
done
|
|
||||||
;;
|
|
||||||
--npus-per-node)
|
|
||||||
shift
|
|
||||||
NPUS_PER_NODE="$1"
|
|
||||||
shift
|
|
||||||
;;
|
|
||||||
--network-card-name)
|
|
||||||
shift
|
|
||||||
NETWORK_CARD_NAME="$1"
|
|
||||||
shift
|
|
||||||
;;
|
|
||||||
--prefill-device-cnt)
|
|
||||||
shift
|
|
||||||
PREFILL_DEVICE_CNT="$1"
|
|
||||||
shift
|
|
||||||
;;
|
|
||||||
--decode-device-cnt)
|
|
||||||
shift
|
|
||||||
DECODE_DEVICE_CNT="$1"
|
|
||||||
shift
|
|
||||||
;;
|
|
||||||
--local-device-ids)
|
|
||||||
shift
|
|
||||||
LOCAL_DEVICE_IDS="$1"
|
|
||||||
shift
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
LOCAL_HOSTS=($(hostname -I))
|
|
||||||
LOCAL_HOST="127.0.0.1"
|
|
||||||
MASTER_ADDR=${IPs[0]}
|
|
||||||
MASTER_PORT=6657
|
|
||||||
NNODES=${#IPs[@]}
|
|
||||||
NODE_RANK="8"
|
|
||||||
for i in "${!IPs[@]}"; do
|
|
||||||
ip="${IPs[$i]}"
|
|
||||||
for local_host in "${LOCAL_HOSTS[@]}"; do
|
|
||||||
if [[ "$local_host" == "$ip" ]]; then
|
|
||||||
LOCAL_HOST=$local_host
|
|
||||||
NODE_RANK=$i
|
|
||||||
break 2
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
done
|
|
||||||
|
|
||||||
if [[ $NODE_RANK == "" ]];then
|
|
||||||
echo "[Error] para \"NODE_RANK\" must be defined"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
WORLD_SIZE=$(($NPUS_PER_NODE * $NNODES))
|
|
||||||
RANKSTART=`expr $NPUS_PER_NODE \* $NODE_RANK`
|
|
||||||
|
|
||||||
echo "========>param:"
|
|
||||||
echo "LOCAL_HOST": $LOCAL_HOST
|
|
||||||
echo "WORLD_SIZE: " $WORLD_SIZE
|
|
||||||
echo "RANKSTART": $RANKSTART
|
|
||||||
echo "NNODES": $NNODES
|
|
||||||
echo "NODE_RANK": $NODE_RANK
|
|
||||||
echo "==============="
|
|
||||||
|
|
||||||
if [ -n "$LOCAL_DEVICE_IDS" ]; then
|
|
||||||
OPTIONAL_SECTION=" --local-device-ids $LOCAL_DEVICE_IDS"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then
|
|
||||||
timeout 180s \
|
|
||||||
GLOO_SOCKET_IFNAME=$NETWORK_CARD_NAME torchrun \
|
|
||||||
--nproc_per_node 1 \
|
|
||||||
--nnodes ${NNODES} \
|
|
||||||
--node_rank ${NODE_RANK} \
|
|
||||||
--master_addr ${MASTER_ADDR} \
|
|
||||||
--master_port ${MASTER_PORT} \
|
|
||||||
gen_ranktable.py --local-host $LOCAL_HOST --prefill-device-cnt $PREFILL_DEVICE_CNT --decode-device-cnt $DECODE_DEVICE_CNT $OPTIONAL_SECTION
|
|
||||||
fi
|
|
||||||
@@ -1,30 +0,0 @@
|
|||||||
export HCCL_IF_IP=141.61.39.117
|
|
||||||
export GLOO_SOCKET_IFNAME="enp48s3u1u1"
|
|
||||||
export TP_SOCKET_IFNAME="enp48s3u1u1"
|
|
||||||
export HCCL_SOCKET_IFNAME="enp48s3u1u1"
|
|
||||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=path-to-rank-table
|
|
||||||
|
|
||||||
export OMP_PROC_BIND=false
|
|
||||||
export OMP_NUM_THREADS=10
|
|
||||||
|
|
||||||
vllm serve model_path \
|
|
||||||
--host 0.0.0.0 \
|
|
||||||
--port 20002 \
|
|
||||||
--tensor-parallel-size 1\
|
|
||||||
--seed 1024 \
|
|
||||||
--served-model-name dsv3 \
|
|
||||||
--max-model-len 2000 \
|
|
||||||
---max-num-batched-tokens 2000 \
|
|
||||||
--trust-remote-code \
|
|
||||||
--gpu-memory-utilization 0.9 \
|
|
||||||
--kv-transfer-config \
|
|
||||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
|
||||||
"kv_buffer_device": "npu",
|
|
||||||
"kv_role": "kv_consumer",
|
|
||||||
"kv_parallel_size": 1,
|
|
||||||
"kv_port": "20001",
|
|
||||||
"engine_id": 0,
|
|
||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_connector_v1_a3"
|
|
||||||
}' \
|
|
||||||
--additional-config \
|
|
||||||
'{"enable_graph_mode": "True"}'\
|
|
||||||
@@ -24,6 +24,7 @@ from multiprocessing import Event, Process
|
|||||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||||
|
|
||||||
|
|
||||||
def clean_up():
|
def clean_up():
|
||||||
import gc
|
import gc
|
||||||
|
|
||||||
@@ -37,9 +38,6 @@ def clean_up():
|
|||||||
|
|
||||||
|
|
||||||
def run_prefill(prefill_done, process_close):
|
def run_prefill(prefill_done, process_close):
|
||||||
# ranktable.json needs be generated using gen_ranktable.sh
|
|
||||||
# from the examples/disaggregated_prefill_v1 in the main branch.
|
|
||||||
os.environ['DISAGGREGATED_PREFILL_RANK_TABLE_PATH'] = "./ranktable.json"
|
|
||||||
os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0"
|
os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0"
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
@@ -51,9 +49,22 @@ def run_prefill(prefill_done, process_close):
|
|||||||
]
|
]
|
||||||
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
|
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
|
||||||
|
|
||||||
ktc = KVTransferConfig(kv_connector="LLMDataDistCMgrConnector", kv_buffer_device="npu", kv_role="kv_producer",
|
ktc = KVTransferConfig(
|
||||||
kv_parallel_size=1,
|
kv_connector="MooncakeConnector",
|
||||||
kv_connector_module_path="vllm_ascend.distributed.llmdatadist_c_mgr_connector")
|
kv_role="kv_producer",
|
||||||
|
kv_port="30000",
|
||||||
|
engine_id="0",
|
||||||
|
kv_connector_module_path="vllm_ascend.distributed.mooncake_connector",
|
||||||
|
kv_connector_extra_config={
|
||||||
|
"prefill": {
|
||||||
|
"dp_size": 1,
|
||||||
|
"tp_size": 1
|
||||||
|
},
|
||||||
|
"decode": {
|
||||||
|
"dp_size": 1,
|
||||||
|
"tp_size": 1
|
||||||
|
}
|
||||||
|
})
|
||||||
# Set NPU memory utilization to 0.8
|
# Set NPU memory utilization to 0.8
|
||||||
llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
||||||
kv_transfer_config=ktc,
|
kv_transfer_config=ktc,
|
||||||
@@ -79,10 +90,6 @@ def run_prefill(prefill_done, process_close):
|
|||||||
|
|
||||||
|
|
||||||
def run_decode(prefill_done):
|
def run_decode(prefill_done):
|
||||||
os.environ['VLLM_ASCEND_LLMDD_RPC_PORT'] = '6634'
|
|
||||||
# ranktable.json needs be generated using gen_ranktable.sh
|
|
||||||
# from the examples/disaggregated_prefill_v1 module in the main branch.
|
|
||||||
os.environ['DISAGGREGATED_PREFILL_RANK_TABLE_PATH'] = "./ranktable.json"
|
|
||||||
os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "1"
|
os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "1"
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
@@ -94,8 +101,22 @@ def run_decode(prefill_done):
|
|||||||
]
|
]
|
||||||
sampling_params = SamplingParams(temperature=0, top_p=0.95)
|
sampling_params = SamplingParams(temperature=0, top_p=0.95)
|
||||||
|
|
||||||
ktc = KVTransferConfig(kv_connector="LLMDataDistCMgrConnector", kv_buffer_device="npu", kv_role="kv_consumer",
|
ktc = KVTransferConfig(
|
||||||
kv_parallel_size=1, kv_connector_module_path="vllm_ascend.distributed.llmdatadist_c_mgr_connector")
|
kv_connector="MooncakeConnector",
|
||||||
|
kv_role="kv_consumer",
|
||||||
|
kv_port="30100",
|
||||||
|
engine_id="1",
|
||||||
|
kv_connector_module_path="vllm_ascend.distributed.mooncake_connector",
|
||||||
|
kv_connector_extra_config={
|
||||||
|
"prefill": {
|
||||||
|
"dp_size": 1,
|
||||||
|
"tp_size": 1
|
||||||
|
},
|
||||||
|
"decode": {
|
||||||
|
"dp_size": 1,
|
||||||
|
"tp_size": 1
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
||||||
kv_transfer_config=ktc,
|
kv_transfer_config=ktc,
|
||||||
|
|||||||
@@ -41,13 +41,21 @@ deployment:
|
|||||||
--gpu-memory-utilization 0.9
|
--gpu-memory-utilization 0.9
|
||||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||||
--kv-transfer-config
|
--kv-transfer-config
|
||||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
'{"kv_connector": "MooncakeConnector",
|
||||||
"kv_buffer_device": "npu",
|
|
||||||
"kv_role": "kv_producer",
|
"kv_role": "kv_producer",
|
||||||
"kv_parallel_size": 1,
|
"kv_port": "30000",
|
||||||
"kv_port": "20001",
|
|
||||||
"engine_id": "0",
|
"engine_id": "0",
|
||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||||
|
"kv_connector_extra_config": {
|
||||||
|
"prefill": {
|
||||||
|
"dp_size": 2,
|
||||||
|
"tp_size": 8
|
||||||
|
},
|
||||||
|
"decode": {
|
||||||
|
"dp_size": 32,
|
||||||
|
"tp_size": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
}'
|
}'
|
||||||
--additional-config
|
--additional-config
|
||||||
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||||
@@ -71,13 +79,21 @@ deployment:
|
|||||||
--gpu-memory-utilization 0.9
|
--gpu-memory-utilization 0.9
|
||||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||||
--kv-transfer-config
|
--kv-transfer-config
|
||||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
'{"kv_connector": "MooncakeConnector",
|
||||||
"kv_buffer_device": "npu",
|
|
||||||
"kv_role": "kv_producer",
|
"kv_role": "kv_producer",
|
||||||
"kv_parallel_size": 1,
|
"kv_port": "30100",
|
||||||
"kv_port": "20001",
|
|
||||||
"engine_id": "1",
|
"engine_id": "1",
|
||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||||
|
"kv_connector_extra_config": {
|
||||||
|
"prefill": {
|
||||||
|
"dp_size": 2,
|
||||||
|
"tp_size": 8
|
||||||
|
},
|
||||||
|
"decode": {
|
||||||
|
"dp_size": 32,
|
||||||
|
"tp_size": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
}'
|
}'
|
||||||
--additional-config
|
--additional-config
|
||||||
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||||
@@ -102,13 +118,21 @@ deployment:
|
|||||||
--gpu-memory-utilization 0.9
|
--gpu-memory-utilization 0.9
|
||||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||||
--kv-transfer-config
|
--kv-transfer-config
|
||||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
'{"kv_connector": "MooncakeConnector",
|
||||||
"kv_buffer_device": "npu",
|
|
||||||
"kv_role": "kv_consumer",
|
"kv_role": "kv_consumer",
|
||||||
"kv_parallel_size": 1,
|
"kv_port": "30200",
|
||||||
"kv_port": "20001",
|
|
||||||
"engine_id": "2",
|
"engine_id": "2",
|
||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||||
|
"kv_connector_extra_config": {
|
||||||
|
"prefill": {
|
||||||
|
"dp_size": 2,
|
||||||
|
"tp_size": 8
|
||||||
|
},
|
||||||
|
"decode": {
|
||||||
|
"dp_size": 32,
|
||||||
|
"tp_size": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
}'
|
}'
|
||||||
--additional-config
|
--additional-config
|
||||||
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||||
@@ -132,13 +156,21 @@ deployment:
|
|||||||
--gpu-memory-utilization 0.9
|
--gpu-memory-utilization 0.9
|
||||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||||
--kv-transfer-config
|
--kv-transfer-config
|
||||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
'{"kv_connector": "MooncakeConnector",
|
||||||
"kv_buffer_device": "npu",
|
|
||||||
"kv_role": "kv_consumer",
|
"kv_role": "kv_consumer",
|
||||||
"kv_parallel_size": 1,
|
"kv_port": "30200",
|
||||||
"kv_port": "20001",
|
|
||||||
"engine_id": "2",
|
"engine_id": "2",
|
||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||||
|
"kv_connector_extra_config": {
|
||||||
|
"prefill": {
|
||||||
|
"dp_size": 2,
|
||||||
|
"tp_size": 8
|
||||||
|
},
|
||||||
|
"decode": {
|
||||||
|
"dp_size": 32,
|
||||||
|
"tp_size": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
}'
|
}'
|
||||||
--additional-config
|
--additional-config
|
||||||
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||||
|
|||||||
@@ -40,13 +40,21 @@ deployment:
|
|||||||
--gpu-memory-utilization 0.9
|
--gpu-memory-utilization 0.9
|
||||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||||
--kv-transfer-config
|
--kv-transfer-config
|
||||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
'{"kv_connector": "MooncakeConnector",
|
||||||
"kv_buffer_device": "npu",
|
|
||||||
"kv_role": "kv_producer",
|
"kv_role": "kv_producer",
|
||||||
"kv_parallel_size": 1,
|
"kv_port": "30000",
|
||||||
"kv_port": "20001",
|
|
||||||
"engine_id": "0",
|
"engine_id": "0",
|
||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||||
|
"kv_connector_extra_config": {
|
||||||
|
"prefill": {
|
||||||
|
"dp_size": 2,
|
||||||
|
"tp_size": 8
|
||||||
|
},
|
||||||
|
"decode": {
|
||||||
|
"dp_size": 32,
|
||||||
|
"tp_size": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
}'
|
}'
|
||||||
--additional-config
|
--additional-config
|
||||||
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
|
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
|
||||||
@@ -70,13 +78,21 @@ deployment:
|
|||||||
--gpu-memory-utilization 0.9
|
--gpu-memory-utilization 0.9
|
||||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||||
--kv-transfer-config
|
--kv-transfer-config
|
||||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
'{"kv_connector": "MooncakeConnector",
|
||||||
"kv_buffer_device": "npu",
|
|
||||||
"kv_role": "kv_producer",
|
"kv_role": "kv_producer",
|
||||||
"kv_parallel_size": 1,
|
"kv_port": "30100",
|
||||||
"kv_port": "20001",
|
"engine_id": "1",
|
||||||
"engine_id": "0",
|
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"kv_connector_extra_config": {
|
||||||
|
"prefill": {
|
||||||
|
"dp_size": 2,
|
||||||
|
"tp_size": 8
|
||||||
|
},
|
||||||
|
"decode": {
|
||||||
|
"dp_size": 32,
|
||||||
|
"tp_size": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
}'
|
}'
|
||||||
--additional-config
|
--additional-config
|
||||||
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
|
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
|
||||||
@@ -101,13 +117,21 @@ deployment:
|
|||||||
--gpu-memory-utilization 0.9
|
--gpu-memory-utilization 0.9
|
||||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||||
--kv-transfer-config
|
--kv-transfer-config
|
||||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
'{"kv_connector": "MooncakeConnector",
|
||||||
"kv_buffer_device": "npu",
|
|
||||||
"kv_role": "kv_consumer",
|
"kv_role": "kv_consumer",
|
||||||
"kv_parallel_size": 1,
|
"kv_port": "30200",
|
||||||
"kv_port": "20001",
|
"engine_id": "2",
|
||||||
"engine_id": "0",
|
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"kv_connector_extra_config": {
|
||||||
|
"prefill": {
|
||||||
|
"dp_size": 2,
|
||||||
|
"tp_size": 8
|
||||||
|
},
|
||||||
|
"decode": {
|
||||||
|
"dp_size": 32,
|
||||||
|
"tp_size": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
}'
|
}'
|
||||||
--additional-config
|
--additional-config
|
||||||
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
|
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
|
||||||
@@ -131,13 +155,21 @@ deployment:
|
|||||||
--gpu-memory-utilization 0.9
|
--gpu-memory-utilization 0.9
|
||||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||||
--kv-transfer-config
|
--kv-transfer-config
|
||||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
'{"kv_connector": "MooncakeConnector",
|
||||||
"kv_buffer_device": "npu",
|
|
||||||
"kv_role": "kv_consumer",
|
"kv_role": "kv_consumer",
|
||||||
"kv_parallel_size": 1,
|
"kv_port": "30200",
|
||||||
"kv_port": "20001",
|
"engine_id": "2",
|
||||||
"engine_id": "0",
|
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
|
||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"kv_connector_extra_config": {
|
||||||
|
"prefill": {
|
||||||
|
"dp_size": 2,
|
||||||
|
"tp_size": 8
|
||||||
|
},
|
||||||
|
"decode": {
|
||||||
|
"dp_size": 32,
|
||||||
|
"tp_size": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
}'
|
}'
|
||||||
--additional-config
|
--additional-config
|
||||||
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
|
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
|
||||||
|
|||||||
@@ -127,9 +127,6 @@ class MultiNodeConfig:
|
|||||||
|
|
||||||
master_ip = self.master_ip
|
master_ip = self.master_ip
|
||||||
if self.disaggregated_prefill:
|
if self.disaggregated_prefill:
|
||||||
self.envs[
|
|
||||||
"DISAGGREGATED_PREFILL_RANK_TABLE_PATH"] = self.disaggregated_prefill.get(
|
|
||||||
"ranktable_path")
|
|
||||||
if self.cur_index < self.decode_start_index:
|
if self.cur_index < self.decode_start_index:
|
||||||
# For prefiller nodes, use the default master ip(index==0) as DP master
|
# For prefiller nodes, use the default master ip(index==0) as DP master
|
||||||
master_ip = self.master_ip
|
master_ip = self.master_ip
|
||||||
|
|||||||
@@ -16,17 +16,6 @@ GIT_ROOT=$(git rev-parse --show-toplevel)
|
|||||||
# Trap the SIGINT signal (triggered by Ctrl+C)
|
# Trap the SIGINT signal (triggered by Ctrl+C)
|
||||||
trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT
|
trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT
|
||||||
|
|
||||||
# Gen ranktable
|
|
||||||
RANKTABLE_PATH=${GIT_ROOT}/examples/disaggregate_prefill_v1/ranktable.json
|
|
||||||
if [ -f "$RANKTABLE_PATH" ]; then
|
|
||||||
rm "$RANKTABLE_PATH"
|
|
||||||
fi
|
|
||||||
cd ${GIT_ROOT}/examples/disaggregate_prefill_v1
|
|
||||||
LOCAL_HOST=`hostname -I|awk -F " " '{print$1}'`
|
|
||||||
bash gen_ranktable.sh --ips $LOCAL_HOST --network-card-name enp189s0f0 --prefill-device-cnt 1 --decode-device-cnt 1
|
|
||||||
cd -
|
|
||||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="$RANKTABLE_PATH"
|
|
||||||
|
|
||||||
# Waits for vLLM to start.
|
# Waits for vLLM to start.
|
||||||
wait_for_server() {
|
wait_for_server() {
|
||||||
local port=$1
|
local port=$1
|
||||||
@@ -69,12 +58,14 @@ run_tests_for_model() {
|
|||||||
# Start prefill instance
|
# Start prefill instance
|
||||||
PREFILL_PORT=8001
|
PREFILL_PORT=8001
|
||||||
|
|
||||||
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=0 VLLM_ASCEND_LLMDD_RPC_PORT=5559 vllm serve $model_name \
|
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=0 vllm serve $model_name \
|
||||||
--port $PREFILL_PORT \
|
--port $PREFILL_PORT \
|
||||||
--seed 1024 \
|
--seed 1024 \
|
||||||
|
--enforce-eager \
|
||||||
--disable-log-requests \
|
--disable-log-requests \
|
||||||
--gpu-memory-utilization 0.8 \
|
--gpu-memory-utilization 0.8 \
|
||||||
--kv-transfer-config '{\"kv_connector\":\"LLMDataDistCMgrConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_device\":\"npu\",\"kv_parallel_size\":\"1\",\"kv_port\":\"20001\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.llmdatadist_c_mgr_connector\"}'"
|
--distributed-executor-backend mp \
|
||||||
|
--kv-transfer-config '{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_producer\",\"kv_port\":\"30000\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.mooncake_connector\",\"kv_connector_extra_config\":{\"prefill\":{\"dp_size\":1,\"tp_size\":1},\"decode\":{\"dp_size\":1,\"tp_size\":1}}}'"
|
||||||
|
|
||||||
if [ -n "$model_args" ]; then
|
if [ -n "$model_args" ]; then
|
||||||
FULL_CMD="$BASE_CMD $model_args"
|
FULL_CMD="$BASE_CMD $model_args"
|
||||||
@@ -88,12 +79,14 @@ run_tests_for_model() {
|
|||||||
DECODE_PORT=8002
|
DECODE_PORT=8002
|
||||||
|
|
||||||
# Build the command with or without model-specific args
|
# Build the command with or without model-specific args
|
||||||
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=1 VLLM_ASCEND_LLMDD_RPC_PORT=6000 vllm serve $model_name \
|
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=1 vllm serve $model_name \
|
||||||
--port $DECODE_PORT \
|
--port $DECODE_PORT \
|
||||||
--seed 1024 \
|
--seed 1024 \
|
||||||
|
--enforce-eager \
|
||||||
--disable-log-requests \
|
--disable-log-requests \
|
||||||
--gpu-memory-utilization 0.8 \
|
--gpu-memory-utilization 0.8 \
|
||||||
--kv-transfer-config '{\"kv_connector\":\"LLMDataDistCMgrConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_device\":\"npu\",\"kv_parallel_size\":\"1\",\"kv_port\":\"20001\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.llmdatadist_c_mgr_connector\"}'"
|
--distributed-executor-backend mp \
|
||||||
|
--kv-transfer-config '{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_consumer\",\"kv_port\":\"30100\",\"engine_id\":\"1\",\"kv_connector_module_path\":\"vllm_ascend.distributed.mooncake_connector\",\"kv_connector_extra_config\":{\"prefill\":{\"dp_size\":1,\"tp_size\":1},\"decode\":{\"dp_size\":1,\"tp_size\":1}}}'"
|
||||||
|
|
||||||
if [ -n "$model_args" ]; then
|
if [ -n "$model_args" ]; then
|
||||||
FULL_CMD="$BASE_CMD $model_args"
|
FULL_CMD="$BASE_CMD $model_args"
|
||||||
@@ -111,7 +104,7 @@ run_tests_for_model() {
|
|||||||
|
|
||||||
# Build the command for the proxy server with all the hosts and ports
|
# Build the command for the proxy server with all the hosts and ports
|
||||||
PROXY_PORT=8192
|
PROXY_PORT=8192
|
||||||
PROXY_CMD="python ${GIT_ROOT}/examples/disaggregate_prefill_v1/toy_proxy_server.py --port $PROXY_PORT"
|
PROXY_CMD="python ${GIT_ROOT}/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py --port $PROXY_PORT"
|
||||||
PROXY_CMD+=" --prefiller-ports ${PREFILL_PORT}"
|
PROXY_CMD+=" --prefiller-ports ${PREFILL_PORT}"
|
||||||
PROXY_CMD+=" --decoder-ports ${DECODE_PORT}"
|
PROXY_CMD+=" --decoder-ports ${DECODE_PORT}"
|
||||||
# Start the proxy server
|
# Start the proxy server
|
||||||
|
|||||||
@@ -1,98 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
|
||||||
|
|
||||||
import os
|
|
||||||
import types
|
|
||||||
|
|
||||||
from tests.ut.kv_connector.utils import (create_request, create_scheduler,
|
|
||||||
create_vllm_config)
|
|
||||||
from vllm_ascend.distributed.llmdatadist_c_mgr_connector import (
|
|
||||||
LLMDataDistCMgrConnectorMetadata, LLMDataDistCMgrConnectorWorker, LLMRole)
|
|
||||||
|
|
||||||
|
|
||||||
def test_basic_inferface():
|
|
||||||
"""Unit test for basic LLMDataDistCMgrConnector interface functionality."""
|
|
||||||
|
|
||||||
vllm_config = create_vllm_config()
|
|
||||||
scheduler = create_scheduler(vllm_config)
|
|
||||||
|
|
||||||
# 2 Full Blocks and 1 Half Block.
|
|
||||||
BLOCK_SIZE = vllm_config.cache_config.block_size
|
|
||||||
NUM_EXTERNAL_FULL_BLOCKS = 2
|
|
||||||
NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5))
|
|
||||||
|
|
||||||
request = create_request(request_id=1,
|
|
||||||
num_tokens=NUM_TOKENS,
|
|
||||||
do_remote_prefill=True)
|
|
||||||
request_id = request.request_id
|
|
||||||
|
|
||||||
scheduler.add_request(request)
|
|
||||||
|
|
||||||
# Remote Prefill, triggers LLMDataDistCMgrConnectorMetadata.
|
|
||||||
scheduler_output = scheduler.schedule()
|
|
||||||
kv_connector_metadata = scheduler_output.kv_connector_metadata
|
|
||||||
assert kv_connector_metadata is not None
|
|
||||||
assert isinstance(kv_connector_metadata, LLMDataDistCMgrConnectorMetadata)
|
|
||||||
|
|
||||||
assert len(kv_connector_metadata.requests) == 1
|
|
||||||
assert request_id in kv_connector_metadata.requests
|
|
||||||
req_meta = kv_connector_metadata.requests[request_id]
|
|
||||||
|
|
||||||
for block_id, block in zip(
|
|
||||||
req_meta.local_block_ids, scheduler.kv_cache_manager.coordinator.
|
|
||||||
single_type_managers[0].req_to_blocks[request_id]):
|
|
||||||
assert block_id == block.block_id
|
|
||||||
|
|
||||||
|
|
||||||
def test_read_agent_metadata():
|
|
||||||
rank_table = {
|
|
||||||
"version":
|
|
||||||
"1.2",
|
|
||||||
"server_count":
|
|
||||||
"2",
|
|
||||||
"prefill_device_list": [{
|
|
||||||
"server_id": "192.168.1.1",
|
|
||||||
"device_id": "0",
|
|
||||||
"device_ip": "10.30.0.1",
|
|
||||||
"cluster_id": "0",
|
|
||||||
}, {
|
|
||||||
"server_id": "192.168.1.1",
|
|
||||||
"device_id": "1",
|
|
||||||
"device_ip": "10.30.0.2",
|
|
||||||
"cluster_id": "1",
|
|
||||||
}, {
|
|
||||||
"server_id": "192.168.1.2",
|
|
||||||
"device_id": "0",
|
|
||||||
"device_ip": "10.30.0.3",
|
|
||||||
"cluster_id": "2",
|
|
||||||
}, {
|
|
||||||
"server_id": "192.168.1.2",
|
|
||||||
"device_id": "1",
|
|
||||||
"device_ip": "10.30.0.4",
|
|
||||||
"cluster_id": "3",
|
|
||||||
}]
|
|
||||||
}
|
|
||||||
|
|
||||||
def get_device_ip(worker_local_ip, worker_tp_rank, worker_visible_devices):
|
|
||||||
old_visible_devices = os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "")
|
|
||||||
worker = types.SimpleNamespace()
|
|
||||||
worker.local_ip = worker_local_ip
|
|
||||||
worker.tp_rank = worker_tp_rank
|
|
||||||
worker.llm_datadist_role = LLMRole.PROMPT
|
|
||||||
worker.pcp_rank = 0
|
|
||||||
worker.tp_size = worker_tp_rank + 1
|
|
||||||
os.environ["ASCEND_RT_VISIBLE_DEVICES"] = worker_visible_devices
|
|
||||||
agent_metadata = LLMDataDistCMgrConnectorWorker.read_agent_metadata(
|
|
||||||
worker, rank_table)
|
|
||||||
os.environ["ASCEND_RT_VISIBLE_DEVICES"] = old_visible_devices
|
|
||||||
return agent_metadata.device_ip
|
|
||||||
|
|
||||||
assert get_device_ip("192.168.1.1", 0, "0") == "10.30.0.1"
|
|
||||||
assert get_device_ip("192.168.1.1", 0, "1") == "10.30.0.2"
|
|
||||||
assert get_device_ip("192.168.1.2", 0, "0") == "10.30.0.3"
|
|
||||||
assert get_device_ip("192.168.1.2", 0, "1") == "10.30.0.4"
|
|
||||||
assert get_device_ip("192.168.1.1", 0, "0,1") == "10.30.0.1"
|
|
||||||
assert get_device_ip("192.168.1.1", 1, "0,1") == "10.30.0.2"
|
|
||||||
assert get_device_ip("192.168.1.1", 0, "") == "10.30.0.1"
|
|
||||||
assert get_device_ip("192.168.1.1", 1, "") == "10.30.0.2"
|
|
||||||
@@ -78,10 +78,9 @@ def create_vllm_config(
|
|||||||
enable_prefix_caching=True,
|
enable_prefix_caching=True,
|
||||||
)
|
)
|
||||||
kv_transfer_config = KVTransferConfig(
|
kv_transfer_config = KVTransferConfig(
|
||||||
kv_connector="LLMDataDistCMgrConnector",
|
kv_connector="MooncakeConnector",
|
||||||
kv_role="kv_both",
|
kv_role="kv_both",
|
||||||
kv_connector_module_path=
|
kv_connector_module_path="vllm_ascend.distributed.mooncake_connector")
|
||||||
"vllm_ascend.distributed.llmdatadist_c_mgr_connector")
|
|
||||||
return VllmConfig(scheduler_config=scheduler_config,
|
return VllmConfig(scheduler_config=scheduler_config,
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
cache_config=cache_config,
|
cache_config=cache_config,
|
||||||
|
|||||||
@@ -20,11 +20,6 @@ from vllm.distributed.kv_transfer.kv_connector.factory import \
|
|||||||
|
|
||||||
|
|
||||||
def register_connector():
|
def register_connector():
|
||||||
KVConnectorFactory.register_connector(
|
|
||||||
"LLMDataDistCMgrConnector",
|
|
||||||
"vllm_ascend.distributed.llmdatadist_c_mgr_connector",
|
|
||||||
"LLMDataDistCMgrConnector")
|
|
||||||
|
|
||||||
KVConnectorFactory.register_connector(
|
KVConnectorFactory.register_connector(
|
||||||
"MooncakeConnectorV1", "vllm_ascend.distributed.mooncake_connector",
|
"MooncakeConnectorV1", "vllm_ascend.distributed.mooncake_connector",
|
||||||
"MooncakeConnector")
|
"MooncakeConnector")
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -103,23 +103,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
|
|||||||
"VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION":
|
"VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION":
|
||||||
lambda: bool(
|
lambda: bool(
|
||||||
int(os.getenv("VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION", '1'))),
|
int(os.getenv("VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION", '1'))),
|
||||||
# `LLMDataDistCMgrConnector` required variable. `DISAGGREGATED_PREFILL_RANK_TABLE_PATH` is
|
|
||||||
# used for llmdatadist to build the communication topology for kv cache transfer, it is
|
|
||||||
# a required variable if `LLMDataDistCMgrConnector` is used as kv connector for disaggregated
|
|
||||||
# pd. The rank table can be generated by adopting the script `gen_ranktable.sh`
|
|
||||||
# in vllm_ascend's example folder.
|
|
||||||
"DISAGGREGATED_PREFILL_RANK_TABLE_PATH":
|
|
||||||
lambda: os.getenv("DISAGGREGATED_PREFILL_RANK_TABLE_PATH", None),
|
|
||||||
# `LLMDataDistCMgrConnector` required variable. `VLLM_ASCEND_LLMDD_RPC_IP` is used as the
|
|
||||||
# rpc communication listening ip, which will be used to receive the agent metadata from the
|
|
||||||
# remote worker.
|
|
||||||
"VLLM_ASCEND_LLMDD_RPC_IP":
|
|
||||||
lambda: os.getenv("VLLM_ASCEND_LLMDD_RPC_IP", "0.0.0.0"),
|
|
||||||
# `LLMDataDistCMgrConnector` required variable. `VLLM_ASCEND_LLMDD_RPC_PORT` is used as the
|
|
||||||
# rpc communication listening port, which will be used to receive the agent metadata from the
|
|
||||||
# remote worker.
|
|
||||||
"VLLM_ASCEND_LLMDD_RPC_PORT":
|
|
||||||
lambda: int(os.getenv("VLLM_ASCEND_LLMDD_RPC_PORT", 5557)),
|
|
||||||
# Whether to enable mla_pa for deepseek mla decode, this flag will be removed after its available torch_npu is public accessible
|
# Whether to enable mla_pa for deepseek mla decode, this flag will be removed after its available torch_npu is public accessible
|
||||||
# and the mla_pa will be the default path of deepseek decode path.
|
# and the mla_pa will be the default path of deepseek decode path.
|
||||||
"VLLM_ASCEND_MLA_PA":
|
"VLLM_ASCEND_MLA_PA":
|
||||||
|
|||||||
@@ -3398,7 +3398,7 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
|
|||||||
# init kv cache tensors
|
# init kv cache tensors
|
||||||
kv_cache_raw_tensors: dict[str, Union[torch.Tensor,
|
kv_cache_raw_tensors: dict[str, Union[torch.Tensor,
|
||||||
Optional[torch.Tensor]]] = {}
|
Optional[torch.Tensor]]] = {}
|
||||||
# llmdatadist need the addr of cache tensor be aligned with 2M
|
# prefill disaggregation need the addr of cache tensor be aligned with 2M
|
||||||
alignment = 2 * 1024 * 1024
|
alignment = 2 * 1024 * 1024
|
||||||
for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
|
for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
|
||||||
# TODO: REFACTOR ME to sharing hybrid cache
|
# TODO: REFACTOR ME to sharing hybrid cache
|
||||||
@@ -3426,7 +3426,7 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
|
|||||||
elif "attn" in layer_name and layer_name not in kv_cache_raw_tensors.keys(
|
elif "attn" in layer_name and layer_name not in kv_cache_raw_tensors.keys(
|
||||||
):
|
):
|
||||||
# NOTE: We need to init k cache tensor (nope cache tensor in mla) and
|
# NOTE: We need to init k cache tensor (nope cache tensor in mla) and
|
||||||
# v cache tensor (rope cache tensor in mla) separately to support llmdatadist,
|
# v cache tensor (rope cache tensor in mla) separately to support prefill disaggregation,
|
||||||
# as it only support the 0-dim of kv_cache is `num_blocks`.
|
# as it only support the 0-dim of kv_cache is `num_blocks`.
|
||||||
# For deepseek mla, we need to spilt cache tensor accrodding to the nope head dim
|
# For deepseek mla, we need to spilt cache tensor accrodding to the nope head dim
|
||||||
# and rope head dim.
|
# and rope head dim.
|
||||||
|
|||||||
Reference in New Issue
Block a user