init v0.11.0rc0

This commit is contained in:
2025-10-14 10:38:28 +08:00
parent 67afd0ea78
commit 66dc16f966
278 changed files with 28130 additions and 11708 deletions

View File

@@ -42,7 +42,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export VLLM_LLMDD_RPC_PORT=5559
export VLLM_ASCEND_LLMDD_RPC_PORT=5559
vllm serve /models/deepseek_r1_w8a8 \
--host 0.0.0.0 \
@@ -70,9 +70,7 @@ vllm serve /models/deepseek_r1_w8a8 \
"kv_port": "20001",
"engine_id": "0",
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}' \
--additional-config \
'{"chunked_prefill_for_mla":true}'
}'
```
Run prefill server P2 on second node:
@@ -85,7 +83,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export VLLM_LLMDD_RPC_PORT=5659
export VLLM_ASCEND_LLMDD_RPC_PORT=5659
vllm serve /models/deepseek_r1_w8a8 \
--host 0.0.0.0 \
@@ -114,9 +112,7 @@ vllm serve /models/deepseek_r1_w8a8 \
"kv_port": "20001",
"engine_id": "0",
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}' \
--additional-config \
'{"chunked_prefill_for_mla":true}'
}'
```
Run decode server d1 on third node:
@@ -131,7 +127,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export VLLM_LLMDD_RPC_PORT=5759
export VLLM_ASCEND_LLMDD_RPC_PORT=5759
vllm serve /models/deepseek_r1_w8a8 \
--host 0.0.0.0 \
@@ -173,7 +169,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export VLLM_LLMDD_RPC_PORT=5859
export VLLM_ASCEND_LLMDD_RPC_PORT=5859
vllm serve /models/deepseek_r1_w8a8 \
--host 0.0.0.0 \

View File

@@ -17,6 +17,10 @@ parser.add_argument("--decode-device-cnt",
type=int,
required=True,
help="number of decode devices")
parser.add_argument("--local-device-ids",
type=str,
required=False,
help="local device ids")
args = parser.parse_args()
local_host = args.local_host
prefill_device_cnt = args.prefill_device_cnt
@@ -54,39 +58,49 @@ chips_per_card = get_cmd_stdout("npu-smi info -l | grep \"Chip Count\"").split(
"\n")[0].split(":")[1].strip()
chips_per_card = int(chips_per_card)
if args.local_device_ids:
local_device_ids = args.local_device_ids.split(',')
else:
local_device_ids = []
for card_id in range(num_cards):
for chip_id in range(chips_per_card):
device_id = card_id * chips_per_card + chip_id
local_device_ids.append(device_id)
# generate local device list for local rank 0, and gather it to all ranks
local_device_list: list[dict[str, str]] = list()
if local_rank == "0":
super_pod_id = "0"
for card_id in range(num_cards):
for chip_id in range(chips_per_card):
device_id = card_id * chips_per_card + chip_id
if soc_info == AscendSocVersion.A3:
device_ip = get_cmd_stdout(
f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr"
).split(":")[1].strip()
super_device_id = get_cmd_stdout(
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep SDID"
).split(":")[1].strip()
super_pod_id = get_cmd_stdout(
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep \"Super Pod ID\""
).split(":")[1].strip()
else:
device_ip = get_cmd_stdout(
f"{hccn_tool_path} -i {device_id} -ip -g | grep ipaddr"
).split(":")[1].strip()
for idx in range(len(local_device_ids)):
device_id = local_device_ids[idx]
chip_id = device_id % chips_per_card
card_id = device_id // chips_per_card
if soc_info == AscendSocVersion.A3:
device_ip = get_cmd_stdout(
f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr"
).split(":")[1].strip()
super_device_id = get_cmd_stdout(
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep SDID"
).split(":")[1].strip()
super_pod_id = get_cmd_stdout(
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep \"Super Pod ID\""
).split(":")[1].strip()
else:
device_ip = get_cmd_stdout(
f"{hccn_tool_path} -i {device_id} -ip -g | grep ipaddr"
).split(":")[1].strip()
device_info = {
"server_id": local_host,
"device_id": str(device_id),
"device_ip": str(device_ip),
}
if soc_info == AscendSocVersion.A3:
device_info.update({
"super_pod_id": str(super_pod_id),
"super_device_id": str(super_device_id)
})
local_device_list.append(device_info)
device_info = {
"server_id": local_host,
"device_id": str(device_id),
"device_ip": str(device_ip),
}
if soc_info == AscendSocVersion.A3:
device_info.update({
"super_pod_id": str(super_pod_id),
"super_device_id": str(super_device_id)
})
local_device_list.append(device_info)
dist.init_process_group(backend=dist.Backend.GLOO)
global_device_list = [None] * dist.get_world_size()

View File

@@ -33,6 +33,11 @@ while [[ $# -gt 0 ]]; do
DECODE_DEVICE_CNT="$1"
shift
;;
--local-device-ids)
shift
LOCAL_DEVICE_IDS="$1"
shift
;;
esac
done
LOCAL_HOSTS=($(hostname -I))
@@ -68,6 +73,10 @@ echo "NNODES": $NNODES
echo "NODE_RANK": $NODE_RANK
echo "==============="
if [ -n "$LOCAL_DEVICE_IDS" ]; then
OPTIONAL_SECTION=" --local-device-ids $LOCAL_DEVICE_IDS"
fi
if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then
GLOO_SOCKET_IFNAME=$NETWORK_CARD_NAME torchrun \
--nproc_per_node 1 \
@@ -75,5 +84,5 @@ if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then
--node_rank ${NODE_RANK} \
--master_addr ${MASTER_ADDR} \
--master_port ${MASTER_PORT} \
gen_ranktable.py --local-host $LOCAL_HOST --prefill-device-cnt $PREFILL_DEVICE_CNT --decode-device-cnt $DECODE_DEVICE_CNT
gen_ranktable.py --local-host $LOCAL_HOST --prefill-device-cnt $PREFILL_DEVICE_CNT --decode-device-cnt $DECODE_DEVICE_CNT $OPTIONAL_SECTION
fi

View File

@@ -363,6 +363,7 @@ async def send_request_to_service(client: httpx.AsyncClient,
}
req_data["stream"] = False
req_data["max_tokens"] = 1
req_data["min_tokens"] = 1
if "stream_options" in req_data:
del req_data["stream_options"]
headers = {

View File

@@ -0,0 +1,272 @@
# Mooncacke Store Deployment Guide
## Environmental Dependencies
* Software:
* Python >= 3.9, < 3.12
* CANN >= 8.2.rc1
* PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
* vLLMmain branch
* vLLM-Ascendmain branch
* Mooncake[AscendTransport/Mooncake at pooling-async-memcpy](https://github.com/AscendTransport/Mooncake/tree/pooling-async-memcpy)(Currently available branch code, continuously updated.)
Installation and Compilation Guidehttps://github.com/AscendTransport/Mooncake/tree/pooling-async-memcpy?tab=readme-ov-file#build-and-use-binaries
## run mooncake master
### 1.Configure mooncake.json
The environment variable **MOONCAKE_CONFIG_PATH** is configured to the full path where mooncake.json is located.
```
{
"local_hostname": "xx.xx.xx.xx",
"metadata_server": "P2PHANDSHAKE",
"protocol": "ascend",
"device_name": "",
"master_server_address": "xx.xx.xx.xx:50088",
"global_segment_size": 30000000000
}
```
**local_hostname**: Configured as the IP address of the current master node,
**metadata_server**: Configured as **P2PHANDSHAKE**,
**protocol:** Configured for Ascend to use Mooncake's HCCL communication,
**device_name**: ""
**master_server_address**: Configured with the IP and port of the master service
**global_segment_size**: Expands the kvcache size registered by the PD node to the master
### 2. Start mooncake_master
Under the mooncake folder:
```
mooncake_master --port 50088
```
## Pooling and Prefill Decode Disaggregate Scenario
### 1.Run `prefill` Node and `decode` Node
Using MultiConnector to simultaneously utilize both p2p connectors and pooled connectors. P2P performs kv_transfer, while pooling creates a larger prefix-cache.
`prefill` Node
```
bash multi_producer.sh
```
The content of the multi_producer.sh script:
```
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
export VLLM_USE_V1=1
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
export ASCEND_TRANSPORT_PRINT=1
export ACL_OP_INIT_MODE=1
# The upper boundary environment variable for memory swap logging is set to mooncake, where 1 indicates enabled and 0 indicates disabled.
export ASCEND_AGGREGATE_ENABLE=1
# The upper-level environment variable is the switch for enabling the mooncake aggregation function, where 1 means on and 0 means off.
python3 -m vllm.entrypoints.openai.api_server \
--model /xxxxx/Qwen2.5-7B-Instruct \
--port 8100 \
--trust-remote-code \
--enforce-eager \
--no_enable_prefix_caching \
--tensor-parallel-size 1 \
--data-parallel-size 1 \
--max-model-len 10000 \
--block-size 128 \
--max-num-batched-tokens 4096 \
--kv-transfer-config \
'{
"kv_connector": "MultiConnector",
"kv_role": "kv_producer",
"kv_connector_extra_config": {
"use_layerwise": false,
"connectors": [
{
"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_producer",
"kv_port": "20001",
"kv_connector_extra_config": {
"prefill": {
"dp_size": 1,
"tp_size": 1
},
"decode": {
"dp_size": 1,
"tp_size": 1
}
}
},
{
"kv_connector": "MooncakeConnectorStoreV1",
"kv_role": "kv_producer",
"mooncake_rpc_port":"0"
}
]
}
}' > p.log 2>&1
```
`decode` Node
```
bash multi_consumer.sh
```
The content of multi_consumer.sh:
```
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
export MOONCAKE_CONFIG_PATH="/xxxxx/mooncake.json"
export VLLM_USE_V1=1
export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7
export ACL_OP_INIT_MODE=1
export ASCEND_TRANSPORT_PRINT=1
# The upper boundary environment variable for memory swap logging is set to mooncake, where 1 indicates enabled and 0 indicates disabled.
export ASCEND_AGGREGATE_ENABLE=1
# The upper-level environment variable is the switch for enabling the mooncake aggregation function, where 1 means on and 0 means off.
python3 -m vllm.entrypoints.openai.api_server \
--model /xxxxx/Qwen2.5-7B-Instruct \
--port 8200 \
--trust-remote-code \
--enforce-eager \
--no_enable_prefix_caching \
--tensor-parallel-size 1 \
--data-parallel-size 1 \
--max-model-len 10000 \
--block-size 128 \
--max-num-batched-tokens 4096 \
--kv-transfer-config \
'{
"kv_connector": "MultiConnector",
"kv_role": "kv_consumer",
"kv_connector_extra_config": {
"use_layerwise": false,
"connectors": [
{
"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_consumer",
"kv_port": "20002",
"kv_connector_extra_config": {
"prefill": {
"dp_size": 1,
"tp_size": 1
},
"decode": {
"dp_size": 1,
"tp_size": 1
}
}
},
{
"kv_connector": "MooncakeConnectorStoreV1",
"kv_role": "kv_consumer",
"mooncake_rpc_port":"1"
}
]
}
}' > d.log 2>&1
```
### 2、Start proxy_server.
```
bash proxy.sh
```
proxy.sh content:
Change localhost to your actual IP address.
```
python vllm-ascend/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py \
--host localhost\
--prefiller-hosts localhost \
--prefiller-ports 8100 \
--decoder-hosts localhost\
--decoder-ports 8200 \
```
### 3. Run Inference
Configure the localhost, port, and model weight path in the command to your own settings.
Short question:
```
curl -s http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Hello. I have a question. The president of the United States is", "max_tokens": 200, "temperature":0.0 }'
```
Long question:
```
curl -s http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Given the accelerating impacts of climate change—including rising sea levels, increasing frequency of extreme weather events, loss of biodiversity, and adverse effects on agriculture and human health—there is an urgent need for a robust, globally coordinated response. However, international efforts are complicated by a range of factors: economic disparities between high-income and low-income countries, differing levels of industrialization, varying access to clean energy technologies, and divergent political systems that influence climate policy implementation. In this context, how can global agreements like the Paris Accord be redesigned or strengthened to not only encourage but effectively enforce emission reduction targets? Furthermore, what mechanisms can be introduced to promote fair and transparent technology transfer, provide adequate financial support for climate adaptation in vulnerable regions, and hold nations accountable without exacerbating existing geopolitical tensions or disproportionately burdening those with historically lower emissions?", "max_tokens": 256, "temperature":0.0 }'
```
## Pooling and Mixed Deployment Scenario
### 1、Run Mixed Department Script
The mixed script is essentially a pure pooling scenario for the P node.
```
bash mixed_department.sh
```
Content of mixed_department.sh:
```
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
export VLLM_USE_V1=1
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
export ACL_OP_INIT_MODE=1
export ASCEND_TRANSPORT_PRINT=1
# The upper boundary environment variable for memory swap logging is set to mooncake, where 1 indicates enabled and 0 indicates disabled.
export ASCEND_AGGREGATE_ENABLE=1
# The upper-level environment variable is the switch for enabling the mooncake aggregation function, where 1 means on and 0 means off.
python3 -m vllm.entrypoints.openai.api_server \
--model /xxxxx/Qwen2.5-7B-Instruct \
--port 8100 \
--trust-remote-code \
--enforce-eager \
--no_enable_prefix_caching \
--tensor-parallel-size 1 \
--data-parallel-size 1 \
--max-model-len 10000 \
--block-size 128 \
--max-num-batched-tokens 4096 \
--kv-transfer-config \
'{
"kv_connector": "MooncakeConnectorStoreV1",
"kv_role": "kv_both",
"kv_connector_extra_config": {
"use_layerwise": false,
"mooncake_rpc_port":"0"
}
}' > mix.log 2>&1
```
### 2. Run Inference
Configure the localhost, port, and model weight path in the command to your own settings. The requests sent will only go to the port where the mixed deployment script is located, and there is no need to start a separate proxy.
Short question:
```
curl -s http://localhost:8100/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Hello. I have a question. The president of the United States is", "max_tokens": 200, "temperature":0.0 }'
```
Long question:
```
curl -s http://localhost:8100/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Given the accelerating impacts of climate change—including rising sea levels, increasing frequency of extreme weather events, loss of biodiversity, and adverse effects on agriculture and human health—there is an urgent need for a robust, globally coordinated response. However, international efforts are complicated by a range of factors: economic disparities between high-income and low-income countries, differing levels of industrialization, varying access to clean energy technologies, and divergent political systems that influence climate policy implementation. In this context, how can global agreements like the Paris Accord be redesigned or strengthened to not only encourage but effectively enforce emission reduction targets? Furthermore, what mechanisms can be introduced to promote fair and transparent technology transfer, provide adequate financial support for climate adaptation in vulnerable regions, and hold nations accountable without exacerbating existing geopolitical tensions or disproportionately burdening those with historically lower emissions?", "max_tokens": 256, "temperature":0.0 }'
```