init v0.11.0rc0
This commit is contained in:
@@ -42,7 +42,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
|
||||
export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=100
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_LLMDD_RPC_PORT=5559
|
||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5559
|
||||
|
||||
vllm serve /models/deepseek_r1_w8a8 \
|
||||
--host 0.0.0.0 \
|
||||
@@ -70,9 +70,7 @@ vllm serve /models/deepseek_r1_w8a8 \
|
||||
"kv_port": "20001",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}' \
|
||||
--additional-config \
|
||||
'{"chunked_prefill_for_mla":true}'
|
||||
}'
|
||||
```
|
||||
|
||||
Run prefill server P2 on second node:
|
||||
@@ -85,7 +83,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
|
||||
export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=100
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_LLMDD_RPC_PORT=5659
|
||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5659
|
||||
|
||||
vllm serve /models/deepseek_r1_w8a8 \
|
||||
--host 0.0.0.0 \
|
||||
@@ -114,9 +112,7 @@ vllm serve /models/deepseek_r1_w8a8 \
|
||||
"kv_port": "20001",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}' \
|
||||
--additional-config \
|
||||
'{"chunked_prefill_for_mla":true}'
|
||||
}'
|
||||
```
|
||||
|
||||
Run decode server d1 on third node:
|
||||
@@ -131,7 +127,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
|
||||
export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=100
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_LLMDD_RPC_PORT=5759
|
||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5759
|
||||
|
||||
vllm serve /models/deepseek_r1_w8a8 \
|
||||
--host 0.0.0.0 \
|
||||
@@ -173,7 +169,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
|
||||
export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=100
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_LLMDD_RPC_PORT=5859
|
||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5859
|
||||
|
||||
vllm serve /models/deepseek_r1_w8a8 \
|
||||
--host 0.0.0.0 \
|
||||
|
||||
@@ -17,6 +17,10 @@ parser.add_argument("--decode-device-cnt",
|
||||
type=int,
|
||||
required=True,
|
||||
help="number of decode devices")
|
||||
parser.add_argument("--local-device-ids",
|
||||
type=str,
|
||||
required=False,
|
||||
help="local device ids")
|
||||
args = parser.parse_args()
|
||||
local_host = args.local_host
|
||||
prefill_device_cnt = args.prefill_device_cnt
|
||||
@@ -54,39 +58,49 @@ chips_per_card = get_cmd_stdout("npu-smi info -l | grep \"Chip Count\"").split(
|
||||
"\n")[0].split(":")[1].strip()
|
||||
chips_per_card = int(chips_per_card)
|
||||
|
||||
if args.local_device_ids:
|
||||
local_device_ids = args.local_device_ids.split(',')
|
||||
else:
|
||||
local_device_ids = []
|
||||
for card_id in range(num_cards):
|
||||
for chip_id in range(chips_per_card):
|
||||
device_id = card_id * chips_per_card + chip_id
|
||||
local_device_ids.append(device_id)
|
||||
|
||||
# generate local device list for local rank 0, and gather it to all ranks
|
||||
local_device_list: list[dict[str, str]] = list()
|
||||
if local_rank == "0":
|
||||
super_pod_id = "0"
|
||||
for card_id in range(num_cards):
|
||||
for chip_id in range(chips_per_card):
|
||||
device_id = card_id * chips_per_card + chip_id
|
||||
if soc_info == AscendSocVersion.A3:
|
||||
device_ip = get_cmd_stdout(
|
||||
f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr"
|
||||
).split(":")[1].strip()
|
||||
super_device_id = get_cmd_stdout(
|
||||
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep SDID"
|
||||
).split(":")[1].strip()
|
||||
super_pod_id = get_cmd_stdout(
|
||||
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep \"Super Pod ID\""
|
||||
).split(":")[1].strip()
|
||||
else:
|
||||
device_ip = get_cmd_stdout(
|
||||
f"{hccn_tool_path} -i {device_id} -ip -g | grep ipaddr"
|
||||
).split(":")[1].strip()
|
||||
for idx in range(len(local_device_ids)):
|
||||
device_id = local_device_ids[idx]
|
||||
chip_id = device_id % chips_per_card
|
||||
card_id = device_id // chips_per_card
|
||||
if soc_info == AscendSocVersion.A3:
|
||||
device_ip = get_cmd_stdout(
|
||||
f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr"
|
||||
).split(":")[1].strip()
|
||||
super_device_id = get_cmd_stdout(
|
||||
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep SDID"
|
||||
).split(":")[1].strip()
|
||||
super_pod_id = get_cmd_stdout(
|
||||
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep \"Super Pod ID\""
|
||||
).split(":")[1].strip()
|
||||
else:
|
||||
device_ip = get_cmd_stdout(
|
||||
f"{hccn_tool_path} -i {device_id} -ip -g | grep ipaddr"
|
||||
).split(":")[1].strip()
|
||||
|
||||
device_info = {
|
||||
"server_id": local_host,
|
||||
"device_id": str(device_id),
|
||||
"device_ip": str(device_ip),
|
||||
}
|
||||
if soc_info == AscendSocVersion.A3:
|
||||
device_info.update({
|
||||
"super_pod_id": str(super_pod_id),
|
||||
"super_device_id": str(super_device_id)
|
||||
})
|
||||
local_device_list.append(device_info)
|
||||
device_info = {
|
||||
"server_id": local_host,
|
||||
"device_id": str(device_id),
|
||||
"device_ip": str(device_ip),
|
||||
}
|
||||
if soc_info == AscendSocVersion.A3:
|
||||
device_info.update({
|
||||
"super_pod_id": str(super_pod_id),
|
||||
"super_device_id": str(super_device_id)
|
||||
})
|
||||
local_device_list.append(device_info)
|
||||
|
||||
dist.init_process_group(backend=dist.Backend.GLOO)
|
||||
global_device_list = [None] * dist.get_world_size()
|
||||
|
||||
@@ -33,6 +33,11 @@ while [[ $# -gt 0 ]]; do
|
||||
DECODE_DEVICE_CNT="$1"
|
||||
shift
|
||||
;;
|
||||
--local-device-ids)
|
||||
shift
|
||||
LOCAL_DEVICE_IDS="$1"
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
LOCAL_HOSTS=($(hostname -I))
|
||||
@@ -68,6 +73,10 @@ echo "NNODES": $NNODES
|
||||
echo "NODE_RANK": $NODE_RANK
|
||||
echo "==============="
|
||||
|
||||
if [ -n "$LOCAL_DEVICE_IDS" ]; then
|
||||
OPTIONAL_SECTION=" --local-device-ids $LOCAL_DEVICE_IDS"
|
||||
fi
|
||||
|
||||
if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then
|
||||
GLOO_SOCKET_IFNAME=$NETWORK_CARD_NAME torchrun \
|
||||
--nproc_per_node 1 \
|
||||
@@ -75,5 +84,5 @@ if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then
|
||||
--node_rank ${NODE_RANK} \
|
||||
--master_addr ${MASTER_ADDR} \
|
||||
--master_port ${MASTER_PORT} \
|
||||
gen_ranktable.py --local-host $LOCAL_HOST --prefill-device-cnt $PREFILL_DEVICE_CNT --decode-device-cnt $DECODE_DEVICE_CNT
|
||||
gen_ranktable.py --local-host $LOCAL_HOST --prefill-device-cnt $PREFILL_DEVICE_CNT --decode-device-cnt $DECODE_DEVICE_CNT $OPTIONAL_SECTION
|
||||
fi
|
||||
|
||||
@@ -363,6 +363,7 @@ async def send_request_to_service(client: httpx.AsyncClient,
|
||||
}
|
||||
req_data["stream"] = False
|
||||
req_data["max_tokens"] = 1
|
||||
req_data["min_tokens"] = 1
|
||||
if "stream_options" in req_data:
|
||||
del req_data["stream_options"]
|
||||
headers = {
|
||||
|
||||
@@ -0,0 +1,272 @@
|
||||
# Mooncake Store Deployment Guide
|
||||
|
||||
## Environmental Dependencies
|
||||
|
||||
* Software:
|
||||
* Python >= 3.9, < 3.12
|
||||
* CANN >= 8.2.rc1
|
||||
* PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
|
||||
* vLLM:main branch
|
||||
* vLLM-Ascend:main branch
|
||||
* Mooncake:[AscendTransport/Mooncake at pooling-async-memcpy](https://github.com/AscendTransport/Mooncake/tree/pooling-async-memcpy)(Currently available branch code, continuously updated.)
|
||||
Installation and Compilation Guide:https://github.com/AscendTransport/Mooncake/tree/pooling-async-memcpy?tab=readme-ov-file#build-and-use-binaries
|
||||
|
||||
## Run Mooncake Master
|
||||
|
||||
### 1. Configure mooncake.json
|
||||
|
||||
Set the environment variable **MOONCAKE_CONFIG_PATH** to the full path of the mooncake.json file.
|
||||
|
||||
```
|
||||
{
|
||||
"local_hostname": "xx.xx.xx.xx",
|
||||
"metadata_server": "P2PHANDSHAKE",
|
||||
"protocol": "ascend",
|
||||
"device_name": "",
|
||||
"master_server_address": "xx.xx.xx.xx:50088",
|
||||
"global_segment_size": 30000000000
|
||||
}
|
||||
```
|
||||
|
||||
**local_hostname**: Configured as the IP address of the current master node,
|
||||
**metadata_server**: Configured as **P2PHANDSHAKE**,
|
||||
**protocol**: Set to `ascend` so that Mooncake uses HCCL communication,
|
||||
**device_name**: ""
|
||||
**master_server_address**: Configured with the IP and port of the master service
|
||||
**global_segment_size**: Expands the kvcache size registered by the PD node to the master
|
||||
|
||||
### 2. Start mooncake_master
|
||||
|
||||
Under the mooncake folder:
|
||||
|
||||
```
|
||||
mooncake_master --port 50088
|
||||
```
|
||||
|
||||
## Pooling and Prefill Decode Disaggregate Scenario
|
||||
|
||||
### 1. Run `prefill` Node and `decode` Node
|
||||
|
||||
Use MultiConnector to run the P2P connector and the pooled connector at the same time. The P2P connector performs kv_transfer, while the pooled connector provides a larger prefix cache.
|
||||
|
||||
`prefill` Node:
|
||||
|
||||
```
|
||||
bash multi_producer.sh
|
||||
```
|
||||
|
||||
The content of the multi_producer.sh script:
|
||||
|
||||
```
|
||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
||||
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
|
||||
export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
|
||||
export VLLM_USE_V1=1
|
||||
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
|
||||
export ASCEND_TRANSPORT_PRINT=1
|
||||
export ACL_OP_INIT_MODE=1
|
||||
# ASCEND_TRANSPORT_PRINT (set above) controls Mooncake's memory swap logging: 1 enables it, 0 disables it.
|
||||
export ASCEND_AGGREGATE_ENABLE=1
|
||||
# ASCEND_AGGREGATE_ENABLE (set above) is the switch for Mooncake's aggregation function: 1 enables it, 0 disables it.
|
||||
|
||||
python3 -m vllm.entrypoints.openai.api_server \
|
||||
--model /xxxxx/Qwen2.5-7B-Instruct \
|
||||
--port 8100 \
|
||||
--trust-remote-code \
|
||||
--enforce-eager \
|
||||
--no_enable_prefix_caching \
|
||||
--tensor-parallel-size 1 \
|
||||
--data-parallel-size 1 \
|
||||
--max-model-len 10000 \
|
||||
--block-size 128 \
|
||||
--max-num-batched-tokens 4096 \
|
||||
--kv-transfer-config \
|
||||
'{
|
||||
"kv_connector": "MultiConnector",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_connector_extra_config": {
|
||||
"use_layerwise": false,
|
||||
"connectors": [
|
||||
{
|
||||
"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "20001",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 1,
|
||||
"tp_size": 1
|
||||
},
|
||||
"decode": {
|
||||
"dp_size": 1,
|
||||
"tp_size": 1
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"kv_connector": "MooncakeConnectorStoreV1",
|
||||
"kv_role": "kv_producer",
|
||||
"mooncake_rpc_port":"0"
|
||||
}
|
||||
]
|
||||
}
|
||||
}' > p.log 2>&1
|
||||
```
|
||||
|
||||
`decode` Node:
|
||||
|
||||
```
|
||||
bash multi_consumer.sh
|
||||
```
|
||||
|
||||
The content of multi_consumer.sh:
|
||||
|
||||
```
|
||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
||||
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
|
||||
export MOONCAKE_CONFIG_PATH="/xxxxx/mooncake.json"
|
||||
export VLLM_USE_V1=1
|
||||
export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7
|
||||
export ACL_OP_INIT_MODE=1
|
||||
export ASCEND_TRANSPORT_PRINT=1
|
||||
# ASCEND_TRANSPORT_PRINT (set above) controls Mooncake's memory swap logging: 1 enables it, 0 disables it.
|
||||
export ASCEND_AGGREGATE_ENABLE=1
|
||||
# ASCEND_AGGREGATE_ENABLE (set above) is the switch for Mooncake's aggregation function: 1 enables it, 0 disables it.
|
||||
|
||||
python3 -m vllm.entrypoints.openai.api_server \
|
||||
--model /xxxxx/Qwen2.5-7B-Instruct \
|
||||
--port 8200 \
|
||||
--trust-remote-code \
|
||||
--enforce-eager \
|
||||
--no_enable_prefix_caching \
|
||||
--tensor-parallel-size 1 \
|
||||
--data-parallel-size 1 \
|
||||
--max-model-len 10000 \
|
||||
--block-size 128 \
|
||||
--max-num-batched-tokens 4096 \
|
||||
--kv-transfer-config \
|
||||
'{
|
||||
"kv_connector": "MultiConnector",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_connector_extra_config": {
|
||||
"use_layerwise": false,
|
||||
"connectors": [
|
||||
{
|
||||
"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "20002",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 1,
|
||||
"tp_size": 1
|
||||
},
|
||||
"decode": {
|
||||
"dp_size": 1,
|
||||
"tp_size": 1
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"kv_connector": "MooncakeConnectorStoreV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"mooncake_rpc_port":"1"
|
||||
}
|
||||
]
|
||||
}
|
||||
}' > d.log 2>&1
|
||||
```
|
||||
|
||||
### 2. Start proxy_server
|
||||
|
||||
```
|
||||
bash proxy.sh
|
||||
```
|
||||
|
||||
proxy.sh content:
|
||||
Change localhost to your actual IP address.
|
||||
|
||||
```
|
||||
python vllm-ascend/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py \
|
||||
--host localhost \
|
||||
--prefiller-hosts localhost \
|
||||
--prefiller-ports 8100 \
|
||||
--decoder-hosts localhost \
|
||||
--decoder-ports 8200
|
||||
```
|
||||
|
||||
### 3. Run Inference
|
||||
|
||||
Configure the localhost, port, and model weight path in the command to your own settings.
|
||||
|
||||
Short question:
|
||||
|
||||
```
|
||||
curl -s http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Hello. I have a question. The president of the United States is", "max_tokens": 200, "temperature":0.0 }'
|
||||
```
|
||||
|
||||
Long question:
|
||||
|
||||
```
|
||||
curl -s http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Given the accelerating impacts of climate change—including rising sea levels, increasing frequency of extreme weather events, loss of biodiversity, and adverse effects on agriculture and human health—there is an urgent need for a robust, globally coordinated response. However, international efforts are complicated by a range of factors: economic disparities between high-income and low-income countries, differing levels of industrialization, varying access to clean energy technologies, and divergent political systems that influence climate policy implementation. In this context, how can global agreements like the Paris Accord be redesigned or strengthened to not only encourage but effectively enforce emission reduction targets? Furthermore, what mechanisms can be introduced to promote fair and transparent technology transfer, provide adequate financial support for climate adaptation in vulnerable regions, and hold nations accountable without exacerbating existing geopolitical tensions or disproportionately burdening those with historically lower emissions?", "max_tokens": 256, "temperature":0.0 }'
|
||||
```
|
||||
|
||||
## Pooling and Mixed Deployment Scenario
|
||||
|
||||
### 1. Run Mixed Deployment Script
|
||||
|
||||
The mixed script is essentially a pure pooling scenario for the P node.
|
||||
|
||||
```
|
||||
bash mixed_department.sh
|
||||
```
|
||||
|
||||
Content of mixed_department.sh:
|
||||
|
||||
```
|
||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
||||
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
|
||||
export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
|
||||
export VLLM_USE_V1=1
|
||||
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
|
||||
export ACL_OP_INIT_MODE=1
|
||||
export ASCEND_TRANSPORT_PRINT=1
|
||||
# ASCEND_TRANSPORT_PRINT (set above) controls Mooncake's memory swap logging: 1 enables it, 0 disables it.
|
||||
export ASCEND_AGGREGATE_ENABLE=1
|
||||
# ASCEND_AGGREGATE_ENABLE (set above) is the switch for Mooncake's aggregation function: 1 enables it, 0 disables it.
|
||||
|
||||
python3 -m vllm.entrypoints.openai.api_server \
|
||||
--model /xxxxx/Qwen2.5-7B-Instruct \
|
||||
--port 8100 \
|
||||
--trust-remote-code \
|
||||
--enforce-eager \
|
||||
--no_enable_prefix_caching \
|
||||
--tensor-parallel-size 1 \
|
||||
--data-parallel-size 1 \
|
||||
--max-model-len 10000 \
|
||||
--block-size 128 \
|
||||
--max-num-batched-tokens 4096 \
|
||||
--kv-transfer-config \
|
||||
'{
|
||||
"kv_connector": "MooncakeConnectorStoreV1",
|
||||
"kv_role": "kv_both",
|
||||
"kv_connector_extra_config": {
|
||||
"use_layerwise": false,
|
||||
"mooncake_rpc_port":"0"
|
||||
}
|
||||
}' > mix.log 2>&1
|
||||
```
|
||||
|
||||
### 2. Run Inference
|
||||
|
||||
Configure the localhost, port, and model weight path in the command to your own settings. The requests sent will only go to the port where the mixed deployment script is located, and there is no need to start a separate proxy.
|
||||
|
||||
Short question:
|
||||
|
||||
```
|
||||
curl -s http://localhost:8100/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Hello. I have a question. The president of the United States is", "max_tokens": 200, "temperature":0.0 }'
|
||||
```
|
||||
|
||||
Long question:
|
||||
|
||||
```
|
||||
curl -s http://localhost:8100/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Given the accelerating impacts of climate change—including rising sea levels, increasing frequency of extreme weather events, loss of biodiversity, and adverse effects on agriculture and human health—there is an urgent need for a robust, globally coordinated response. However, international efforts are complicated by a range of factors: economic disparities between high-income and low-income countries, differing levels of industrialization, varying access to clean energy technologies, and divergent political systems that influence climate policy implementation. In this context, how can global agreements like the Paris Accord be redesigned or strengthened to not only encourage but effectively enforce emission reduction targets? Furthermore, what mechanisms can be introduced to promote fair and transparent technology transfer, provide adequate financial support for climate adaptation in vulnerable regions, and hold nations accountable without exacerbating existing geopolitical tensions or disproportionately burdening those with historically lower emissions?", "max_tokens": 256, "temperature":0.0 }'
|
||||
```
|
||||
@@ -43,4 +43,4 @@ vllm serve model_path \
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}' \
|
||||
--additional-config \
|
||||
'{"ascend_scheduler_config": {"enabled": true}, "torchair_graph_config":{"enabled":true,"enable_kv_nz":false, "enable_multistream_moe":false, "graph_batch_size":[28]}, "enable_weight_nz_layout":true}'
|
||||
'{"ascend_scheduler_config": {"enabled": true}, "torchair_graph_config":{"enabled":true,"enable_kv_nz":false, "graph_batch_size":[28]}, "enable_weight_nz_layout":true, "enable_multistream_moe":false}'
|
||||
@@ -79,7 +79,7 @@ def run_prefill(prefill_done, process_close):
|
||||
|
||||
|
||||
def run_decode(prefill_done):
|
||||
os.environ['VLLM_LLMDD_RPC_PORT'] = '6634'
|
||||
os.environ['VLLM_ASCEND_LLMDD_RPC_PORT'] = '6634'
|
||||
# ranktable.json needs to be generated using gen_ranktable.sh
|
||||
# from the examples/disaggregated_prefill_v1 module in the main branch.
|
||||
os.environ['DISAGGREGATED_PREFILL_RANK_TABLE_PATH'] = "./ranktable.json"
|
||||
|
||||
326
examples/offline_weight_load.py
Normal file
326
examples/offline_weight_load.py
Normal file
@@ -0,0 +1,326 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
# Adapted from vllm-project/vllm/examples/offline_inference/data_parallel.py
|
||||
|
||||
# Note: This script is designed to run with e2e test,
|
||||
# please be careful to modify it.
|
||||
"""
|
||||
Usage:
|
||||
Single node:
|
||||
Dense models:
|
||||
python examples/offline_weight_load.py \
|
||||
--model="Qwen/Qwen2.5-0.5B-Instruct" \
|
||||
--tp-size=1 \
|
||||
--proc-per-node=2
|
||||
MOE models:
|
||||
python examples/offline_weight_load.py \
|
||||
--model="Qwen/Qwen3-30B-A3B" \
|
||||
--tp-size=2 \
|
||||
--proc-per-node=2 \
|
||||
--enable-expert-parallel
|
||||
|
||||
Multi-node:
|
||||
Node 0 (assume the node has ip of 10.99.48.128):
|
||||
python examples/offline_weight_load.py \
|
||||
--model="Qwen/Qwen3-30B-A3B" \
|
||||
--tp-size=2 \
|
||||
--node-size=2 \
|
||||
--node-rank=0 \
|
||||
--proc-per-node=2 \
|
||||
--enable-expert-parallel \
|
||||
--master-addr=10.99.48.128 \
|
||||
--master-port=13345
|
||||
Node 1:
|
||||
python examples/offline_weight_load.py \
|
||||
--model="Qwen/Qwen3-30B-A3B" \
|
||||
--tp-size=2 \
|
||||
--node-size=2 \
|
||||
--node-rank=1 \
--proc-per-node=2 \
|
||||
--enable-expert-parallel \
|
||||
--master-addr=10.99.48.128 \
|
||||
--master-port=13345
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import contextlib
|
||||
import gc
|
||||
import os
|
||||
from multiprocessing import Process
|
||||
from time import sleep
|
||||
|
||||
import torch
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed.parallel_state import ( # noqa E402
|
||||
destroy_distributed_environment, destroy_model_parallel, get_tp_group)
|
||||
from vllm.utils import get_open_port, GiB_bytes
|
||||
from safetensors.torch import load_file
|
||||
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
def patch_vllm_moe_model_weight_loader(model):
    """Attach the experts' weight loader to fused MoE expert parameters.

    The inner network is taken from ``model.model`` or, when that is missing
    or falsy, from ``model.language_model``.  For every layer of that network,
    each parameter of ``layer.mlp`` whose name contains ``w13_weight`` or
    ``w2_weight`` gets its ``weight_loader`` attribute set to
    ``layer.mlp.experts.weight_loader``.  Other parameters are left untouched.

    Args:
        model: Object exposing a ``model`` or ``language_model`` attribute
            whose value has an iterable ``layers`` attribute.

    Raises:
        ValueError: If neither attribute yields an inner model.
    """
    inner_model = getattr(model, "model", None)
    if not inner_model:
        inner_model = getattr(model, "language_model", None)
    if inner_model is None:
        raise ValueError("The provided model does not have a valid 'model' or 'language_model' attribute.")

    expert_weight_tags = ("w13_weight", "w2_weight")
    for decoder_layer in inner_model.layers:
        mlp_module = getattr(decoder_layer, "mlp")
        # Materialise the parameters first, then patch the matching ones.
        named_params = dict(mlp_module.named_parameters())
        for param_name in named_params:
            if any(tag in param_name for tag in expert_weight_tags):
                named_params[param_name].weight_loader = mlp_module.experts.weight_loader
|
||||
|
||||
def load_and_merge_safetensors(directory):
    """Load every ``*.safetensors`` file in *directory* into a single dict.

    Files are processed in sorted filename order.  ``os.listdir`` returns
    entries in arbitrary order, so sorting makes the progress output and the
    outcome for any tensor name present in more than one file (the later file
    wins) reproducible between runs and machines.

    Args:
        directory: Path of the directory holding the checkpoint shards.

    Returns:
        A dict containing the merged contents of every ``load_file`` result.
        Empty when the directory holds no ``.safetensors`` file.

    Raises:
        ValueError: If *directory* is not an existing directory.
    """
    if not os.path.isdir(directory):
        raise ValueError(f"directory is not exist : {directory}")

    merged_dict = {}
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.safetensors'):
            continue
        file_path = os.path.join(directory, filename)
        print(f"loading file: {file_path}")
        merged_dict.update(load_file(file_path))

    return merged_dict
|
||||
|
||||
def parse_args(argv=None):
    """Parse and validate the launcher's command-line options.

    Args:
        argv: Optional list of argument strings.  When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is what callers that pass
            no argument get.

    Returns:
        The populated ``argparse.Namespace``.

    Raises:
        SystemExit: Via ``parser.error`` when ``--enable-sleep-mode`` is set
            and ``--model-weight-gib`` is missing or not positive, or
            ``--temperature`` is not zero.
    """
    parser = argparse.ArgumentParser(description="External launcher Inference")
    parser.add_argument(
        "--model",
        type=str,
        default="Qwen/Qwen3-0.6B",
        help="Model name or path",
    )
    parser.add_argument("--tp-size",
                        type=int,
                        default=1,
                        help="Tensor parallel size")
    parser.add_argument("--node-size",
                        type=int,
                        default=1,
                        help="Total number of nodes")
    parser.add_argument("--node-rank",
                        type=int,
                        default=0,
                        help="Rank of the current node")
    parser.add_argument("--proc-per-node",
                        type=int,
                        default=1,
                        help="Number of processes per node")
    parser.add_argument("--master-addr",
                        type=str,
                        default="",
                        help="Master node IP address")
    parser.add_argument("--master-port",
                        type=int,
                        default=0,
                        help="Master node port")
    parser.add_argument("--enforce-eager",
                        action="store_true",
                        help="Enforce eager mode execution.")
    parser.add_argument("--trust-remote-code",
                        action="store_true",
                        help="Trust remote code.")
    parser.add_argument("--enable-expert-parallel",
                        action="store_true",
                        help="Enable expert parallel, used in MOE models.")
    parser.add_argument("--enable-sleep-mode",
                        action="store_true",
                        help="Enable sleep mode for the engine.")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.8,
                        help="Float that controls the randomness of the sampling.")
    parser.add_argument("--model-weight-gib",
                        type=float,
                        default=None,
                        help="Model weight memory usage in GiB (e.g., 1.0 for 0.5B model).")

    args = parser.parse_args(argv)
    if args.enable_sleep_mode:
        # The sleep-mode check in main() compares freed memory against the
        # model weight size and compares generated text before/after wake-up,
        # so both a weight size and zero temperature are required.
        if args.model_weight_gib is None or args.temperature != 0:
            parser.error("model-weight-gib must be provided, and temperature must be zero when enable-sleep-mode is set.")
        if args.model_weight_gib <= 0:
            parser.error("model-weight-gib must be greater than 0 when enable-sleep-mode is set.")
        # A former third check ("default model requires model-weight-gib") was
        # unreachable: the first check already rejects a missing weight size.

    return args
|
||||
|
||||
|
||||
def main(
    local_rank: int,
    rank: int,
    master_addr: str,
    master_port: int,
    model_weight_gib: float,
    model: str = "Qwen/Qwen3-30B-A3B",
    world_size: int = 4,
    tensor_parallel_size: int = 2,
    enable_expert_parallel: bool = False,
    enforce_eager: bool = True,
    trust_remote_code: bool = True,
    enable_sleep_mode: bool = False,
    temperature: float = 0.8,
):
    """Run one externally launched worker: load, reload weights, generate.

    The worker exports the torch.distributed rendezvous variables, initialises
    the process group if needed, builds an ``LLM`` with the
    ``external_launcher`` executor, reloads the weights from the safetensors
    files found under ``model``, and generates completions for a fixed prompt
    list.  With ``enable_sleep_mode`` it additionally puts the engine to
    sleep, wakes it up and regenerates; rank 0 asserts that enough memory was
    freed and that the first completion is unchanged.

    Note that ``model`` is also handed to ``load_and_merge_safetensors``,
    which requires it to be an existing local directory.

    Args:
        local_rank: Rank of this process on its node.
        rank: Global rank of this process.
        master_addr: Rendezvous address exported as ``MASTER_ADDR``.
        master_port: Rendezvous port exported as ``MASTER_PORT``.
        model_weight_gib: Model weight size in GiB, used by the sleep check.
        model: Model path passed to ``LLM`` and to the safetensors loader.
        world_size: Total number of processes.
        tensor_parallel_size: Tensor parallel degree passed to ``LLM``.
        enable_expert_parallel: Forwarded to ``LLM``.
        enforce_eager: Forwarded to ``LLM``.
        trust_remote_code: Forwarded to ``LLM``.
        enable_sleep_mode: Forwarded to ``LLM``; also enables the sleep check.
        temperature: Sampling temperature.
    """
    # Rendezvous settings, exported in the same order as before.
    os.environ.update({
        "MASTER_ADDR": master_addr,
        "MASTER_PORT": str(master_port),
        "RANK": str(rank),
        "LOCAL_RANK": str(local_rank),
        "WORLD_SIZE": str(world_size),
    })
    if not torch.distributed.is_initialized():
        torch.distributed.init_process_group(
            backend="cpu:gloo,npu:hccl",
            world_size=world_size,
            rank=rank,
        )

    base_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    prompts = base_prompts * 10
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.95,
        max_tokens=10,
    )
    llm = LLM(
        model=model,
        tensor_parallel_size=tensor_parallel_size,
        enable_expert_parallel=enable_expert_parallel,
        enforce_eager=enforce_eager,
        trust_remote_code=trust_remote_code,
        distributed_executor_backend="external_launcher",
        seed=0,
        gpu_memory_utilization=0.95,
        enable_sleep_mode=enable_sleep_mode,
    )

    # Reload the weights from disk into the already constructed model.
    worker = llm.llm_engine.model_executor.driver_worker.worker
    live_model = worker.model_runner.model
    patch_vllm_moe_model_weight_loader(live_model)
    state_dict = load_and_merge_safetensors(model)
    live_model.load_weights(state_dict.items())
    print('load state dict done')
    print(f'TP RANKS: {get_tp_group().ranks}')

    outputs = llm.generate(prompts, sampling_params)

    if enable_sleep_mode:
        is_reporting_rank = rank == 0
        if is_reporting_rank:
            free_before, _ = torch.npu.mem_get_info()
        llm.sleep(level=1)
        if is_reporting_rank:
            free_after, _ = torch.npu.mem_get_info()
            freed_bytes = free_after - free_before
            print(f"Freed memory: {freed_bytes / 1024 ** 3:.2f} GiB")
            # Sleeping should release at least this rank's share of weights.
            assert freed_bytes >= model_weight_gib / tensor_parallel_size * GiB_bytes

        llm.wake_up()
        outputs_after_wakeup = llm.generate(prompts, sampling_params)
        if is_reporting_rank:
            # Compare the first completion before and after the sleep cycle.
            assert outputs[0].outputs[0].text == outputs_after_wakeup[0].outputs[0].text
            print("Sleep and wake up successfully!!")

    # Print at most the first five results.
    for index, result in enumerate(outputs):
        if index >= 5:
            break
        prompt = result.prompt
        generated_text = result.outputs[0].text
        print(f"Global rank: {rank}, Prompt: {prompt!r}, "
              f"Generated text: {generated_text!r}")

    # Give engines time to pause their processing loops before exiting.
    sleep(5)
    del llm
    cleanup_env_and_memory()
|
||||
|
||||
|
||||
def cleanup_env_and_memory():
    """Tear down the distributed state and release cached NPU memory.

    Destroys vLLM's model-parallel groups and distributed environment, then
    the torch process group (ignoring ``AssertionError`` from that call), and
    finally runs the garbage collector, empties the NPU cache and resets the
    NPU peak-memory statistics.
    """
    destroy_model_parallel()
    destroy_distributed_environment()
    try:
        torch.distributed.destroy_process_group()
    except AssertionError:
        # Best-effort teardown: presumably raised when there is no process
        # group left to destroy.
        pass
    gc.collect()
    torch.npu.empty_cache()
    torch.npu.reset_peak_memory_stats()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Launch ``proc_per_node`` worker processes for this node and propagate
    # their exit status.

    # Per-process join budget in seconds.  The kill message below is derived
    # from it so the two cannot drift apart (it used to claim "30 minutes"
    # while the timeout was 600 seconds).
    join_timeout_seconds = 600

    args = parse_args()

    tp_size = args.tp_size
    node_size = args.node_size
    proc_per_node = args.proc_per_node
    node_rank = args.node_rank

    if node_size == 1:
        # Single node: rendezvous locally on a free port.
        master_addr = "127.0.0.1"
        master_port = get_open_port()
    else:
        master_addr = args.master_addr
        master_port = args.master_port

    world_size = node_size * proc_per_node

    procs = []
    # Global ranks owned by this node are a contiguous range; the position in
    # that range is the local rank.
    for local_rank, rank in enumerate(
            range(proc_per_node * node_rank, proc_per_node * (node_rank + 1))):
        proc = Process(target=main,
                       args=(
                           local_rank,
                           rank,
                           master_addr,
                           master_port,
                           args.model_weight_gib,
                           args.model,
                           world_size,
                           tp_size,
                           args.enable_expert_parallel,
                           args.enforce_eager,
                           args.trust_remote_code,
                           args.enable_sleep_mode,
                           args.temperature,
                       ))

        proc.start()
        procs.append(proc)

    exit_code = 0
    for proc in procs:
        proc.join(timeout=join_timeout_seconds)
        if proc.exitcode is None:
            # Still running after the timeout: kill it and report failure.
            print(
                f"Killing process {proc.pid} that didn't stop within "
                f"{join_timeout_seconds // 60} minutes."
            )
            proc.kill()
            exit_code = 1
        elif proc.exitcode:
            exit_code = proc.exitcode

    # ``exit`` is injected by the ``site`` module; raising SystemExit has the
    # same effect without depending on it.
    raise SystemExit(exit_code)
|
||||
@@ -29,4 +29,4 @@ vllm serve Qwen/Qwen1.5-MoE-A2.7B \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--trust-remote-code \
|
||||
--enforce-eager \
|
||||
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":false, "enable_multistream_moe":false, "use_cached_graph":false}}'
|
||||
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":false, "use_cached_graph":false}}'
|
||||
|
||||
Reference in New Issue
Block a user