From 3b3dd2a889f77f32046eca143c7abf01e6c987f5 Mon Sep 17 00:00:00 2001
From: aipaes <82140963+aipaes@users.noreply.github.com>
Date: Tue, 17 Mar 2026 23:09:12 +0800
Subject: [PATCH] [doc] Refresh the documentation for GLM-4.7 (#7292)

### What this PR does / why we need it?
Refresh the documentation for GLM4.7.

---------

Signed-off-by: zjks98
Co-authored-by: zjks98
---
 docs/source/tutorials/models/GLM4.x.md | 597 ++++++++++++++++++++++---
 1 file changed, 527 insertions(+), 70 deletions(-)

diff --git a/docs/source/tutorials/models/GLM4.x.md b/docs/source/tutorials/models/GLM4.x.md
index f9f4da03..72921913 100644
--- a/docs/source/tutorials/models/GLM4.x.md
+++ b/docs/source/tutorials/models/GLM4.x.md
@@ -23,6 +23,7 @@ Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the fea
- `GLM-4.7`(BF16 version): [Download model weight](https://www.modelscope.cn/models/ZhipuAI/GLM-4.7).
- `GLM-4.5-w8a8-with-float-mtp`(Quantized version with mtp): [Download model weight](https://modelers.cn/models/Modelers_Park/GLM-4.5-w8a8).
- `GLM-4.6-w8a8`(Quantized version without mtp): [Download model weight](https://modelers.cn/models/Modelers_Park/GLM-4.6-w8a8). We do not provide an MTP version because vLLM did not yet support GLM4.6 MTP when these weights were published in October. Support has since been added, so you can use the quantization scheme below to add MTP weights to the quantized weights.
+- `GLM-4.7-w8a8-with-float-mtp`(Quantized version with float mtp): [Download model weight](https://modelscope.cn/models/Eco-Tech/GLM-4.7-W8A8-floatmtp-floatmtp/summary).
- `Quantization method`: [quantization scheme](https://blog.csdn.net/qq_37368095/article/details/156429653?spm=1011.2124.3001.6209). You can use this method to quantize the model.

It is recommended to download the model weight to the shared directory of multiple nodes, such as `/root/.cache/`.
@@ -74,38 +75,45 @@ docker run --rm \
### Single-node Deployment

- In low-latency scenarios, we recommend a single-machine deployment.
-- Quantized model `glm4.5_w8a8_with_float_mtp` can be deployed on 1 Atlas 800 A3 (64G × 16) or 1 Atlas 800 A2 (64G × 8).
+- Quantized model `glm4.7_w8a8_with_float_mtp` can be deployed on 1 Atlas 800 A3 (64G × 16) or 1 Atlas 800 A2 (64G × 8).

Run the following script to execute online inference.
```shell
#!/bin/sh

-export HCCL_BUFFSIZE=1024
+export HCCL_BUFFSIZE=512
export OMP_PROC_BIND=false
-export OMP_NUM_THREADS=10
+export OMP_NUM_THREADS=1
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_OP_EXPANSION_MODE=AIV
+export VLLM_ASCEND_BALANCE_SCHEDULING=1
+export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1
+export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
+export VLLM_ASCEND_ENABLE_FUSED_MC2=1

-vllm serve /weight/glm4.5_w8a8_with_float_mtp \
-    --data-parallel-size 1 \
-    --tensor-parallel-size 16 \
+vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \
+    --data-parallel-size 2 \
+    --tensor-parallel-size 8 \
+    --enable-expert-parallel \
    --seed 1024 \
    --served-model-name glm \
-    --max-model-len 35000 \
-    --max-num-batched-tokens 16384 \
+    --max-model-len 133000 \
+    --max-num-batched-tokens 8192 \
    --max-num-seqs 16 \
+    --async-scheduling \
+    --quantization ascend \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
-    --speculative-config '{"num_speculative_tokens": 1, "model":"/weight/glm4.5_w8a8_with_float_mtp", "method":"mtp"}' \
-    --compilation-config '{"cudagraph_capture_sizes": [1,2,4,8,16,32], "cudagraph_mode": "FULL_DECODE_ONLY"}' \
-    --async-scheduling
+    --speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \
+    --compilation-config '{"cudagraph_capture_sizes": [1,2,4,8,16,32,64,128,256,512], "cudagraph_mode": "FULL_DECODE_ONLY"}' \
+    --additional-config '{"enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}'
```

**Notice:** The parameters are explained as follows:

-- For single-node deployment, we recommend using `dp1tp16` and turn off expert parallel in low-latency scenarios.
- `--async-scheduling` Asynchronous scheduling is a technique used to optimize inference efficiency. It allows non-blocking task scheduling to improve concurrency and throughput, especially when processing large-scale models.
+- `fusion_ops_gmmswigluquant` Setting this option to `false` disables the GmmSwigluQuant fusion operator, whose performance tends to degrade when the total number of NPUs is ≤ 16.
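+Once the server is up, you can send a request to verify the deployment. Below is a minimal sketch, assuming the server listens on the default port `8000` on the local host and uses the served model name `glm` configured above:
+
+```shell
+curl http://localhost:8000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "glm",
+        "messages": [{"role": "user", "content": "Briefly introduce yourself."}],
+        "max_tokens": 64
+    }'
+```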
### Multi-node Deployment @@ -125,37 +133,41 @@ export HCCL_IF_IP=$local_ip export GLOO_SOCKET_IFNAME=$nic_name export TP_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name +export HCCL_BUFFSIZE=512 export OMP_PROC_BIND=false export OMP_NUM_THREADS=1 -export HCCL_BUFFSIZE=200 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True +export HCCL_OP_EXPANSION_MODE=AIV export VLLM_ASCEND_BALANCE_SCHEDULING=1 -export HCCL_INTRA_PCIE_ENABLE=1 -export HCCL_INTRA_ROCE_ENABLE=0 -export VLLM_USE_MODELSCOPE=True - -vllm serve ZhipuAI/GLM-4.7 \ - --host 0.0.0.0 \ - --port 30000 \ - --data-parallel-size 4 \ - --data-parallel-size-local 2 \ - --data-parallel-address $local_ip \ - --data-parallel-rpc-port 13389 \ - --tensor-parallel-size 4 \ - --seed 1024 \ - --async-scheduling \ - --max-num-seqs 16 \ - --max-model-len 16384 \ - --max-num-batched-tokens 4096 \ - --gpu-memory-utilization 0.92 \ - --enable-auto-tool-choice \ - --reasoning-parser glm45 \ - --tool-call-parser glm47 \ - --speculative-config {"num_speculative_tokens":3,"method":"mtp"} \ - --compilation-config {"cudagraph_capture_sizes":[4,16,32,48,64], "cudagraph_mode": "FULL_DECODE_ONLY"} \ - --trust-remote-code \ - --served-model-name glm47 +export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1 +export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 +export VLLM_ASCEND_ENABLE_FUSED_MC2=1 +vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \ + --host 0.0.0.0 \ + --port 8004 \ + --data-parallel-size 2 \ + --data-parallel-size-local 1 \ + --data-parallel-start-rank 0 \ + --data-parallel-address $local_ip \ + --data-parallel-rpc-port 13389 \ + --tensor-parallel-size 8 \ + --enable-expert-parallel \ + --seed 1024 \ + --max-model-len 140000 \ + --max-num-batched-tokens 8192 \ + --max-num-seqs 16 \ + --async-scheduling \ + --quantization ascend \ + --trust-remote-code \ + --gpu-memory-utilization 0.9 \ + --enable-auto-tool-choice \ + --reasoning-parser glm45 \ + --tool-call-parser glm47 \ + --served-model-name glm47 \ + --speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \ + --compilation-config '{"cudagraph_capture_sizes": [1,2,4,8,16,32,64,128,256,512], "cudagraph_mode": "FULL_DECODE_ONLY"}' \ + --additional-config '{"enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}' ``` **Node 1** @@ -173,43 +185,488 @@ export HCCL_IF_IP=$local_ip export GLOO_SOCKET_IFNAME=$nic_name export TP_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name +export HCCL_BUFFSIZE=512 export OMP_PROC_BIND=false export OMP_NUM_THREADS=1 -export HCCL_BUFFSIZE=200 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True +export HCCL_OP_EXPANSION_MODE=AIV export VLLM_ASCEND_BALANCE_SCHEDULING=1 -export HCCL_INTRA_PCIE_ENABLE=1 -export HCCL_INTRA_ROCE_ENABLE=0 -export VLLM_USE_MODELSCOPE=True +export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1 +export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 +export VLLM_ASCEND_ENABLE_FUSED_MC2=1 -vllm serve ZhipuAI/GLM-4.7 \ - --host 0.0.0.0 \ - --port 30000 \ - --headless \ - --data-parallel-size 4 \ - --data-parallel-size-local 2 \ - --data-parallel-start-rank 2 \ - --data-parallel-address $node0_ip \ - --data-parallel-rpc-port 13389 \ - --tensor-parallel-size 4 \ - --seed 1024 \ - --async-scheduling \ - --max-num-seqs 16 \ - --max-model-len 16384 \ - --max-num-batched-tokens 4096 \ - --gpu-memory-utilization 0.92 \ - --enable-auto-tool-choice \ - --reasoning-parser glm45 \ - --tool-call-parser glm47 \ - --speculative-config {"num_speculative_tokens":3,"method":"mtp"} \ - 
--compilation-config {"cudagraph_capture_sizes":[4,16,32,48,64], "cudagraph_mode": "FULL_DECODE_ONLY"} \
-    --trust-remote-code \
-    --served-model-name glm47
+vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \
+    --host 0.0.0.0 \
+    --port 8004 \
+    --headless \
+    --data-parallel-size 2 \
+    --data-parallel-size-local 1 \
+    --data-parallel-start-rank 1 \
+    --data-parallel-address $node0_ip \
+    --data-parallel-rpc-port 13389 \
+    --tensor-parallel-size 8 \
+    --enable-expert-parallel \
+    --seed 1024 \
+    --max-model-len 140000 \
+    --max-num-batched-tokens 8192 \
+    --max-num-seqs 16 \
+    --async-scheduling \
+    --quantization ascend \
+    --trust-remote-code \
+    --gpu-memory-utilization 0.9 \
+    --enable-auto-tool-choice \
+    --reasoning-parser glm45 \
+    --tool-call-parser glm47 \
+    --served-model-name glm47 \
+    --speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \
+    --compilation-config '{"cudagraph_capture_sizes": [1,2,4,8,16,32,64,128,256,512], "cudagraph_mode": "FULL_DECODE_ONLY"}' \
+    --additional-config '{"enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}'
```

### Prefill-Decode Disaggregation

-Not test yet.
+This section shows how to deploy `GLM4.7` in a multi-node environment with two prefill instances and one decode instance (2P1D) for better performance.
+
+Before you start, please:
+
+1. Prepare the script `launch_online_dp.py` on each node:
+
+    ```python
+    import argparse
+    import multiprocessing
+    import os
+    import subprocess
+    import sys
+
+    def parse_args():
+        parser = argparse.ArgumentParser()
+        parser.add_argument(
+            "--dp-size",
+            type=int,
+            required=True,
+            help="Data parallel size."
+        )
+        parser.add_argument(
+            "--tp-size",
+            type=int,
+            default=1,
+            help="Tensor parallel size."
+        )
+        parser.add_argument(
+            "--dp-size-local",
+            type=int,
+            default=-1,
+            help="Local data parallel size."
+        )
+        parser.add_argument(
+            "--dp-rank-start",
+            type=int,
+            default=0,
+            help="Starting rank for data parallel."
+        )
+        parser.add_argument(
+            "--dp-address",
+            type=str,
+            required=True,
+            help="IP address for data parallel master node."
+        )
+        parser.add_argument(
+            "--dp-rpc-port",
+            type=str,
+            default="12345",  # keep the default a string: it is passed straight into the launch command
+            help="Port for data parallel master node."
+        )
+        parser.add_argument(
+            "--vllm-start-port",
+            type=int,
+            default=9000,
+            help="Starting port for the engine."
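+            # NOTE: each local DP rank i serves on vllm_start_port + i (see the launch loop below).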
+        )
+        return parser.parse_args()
+
+    args = parse_args()
+    dp_size = args.dp_size
+    tp_size = args.tp_size
+    dp_size_local = args.dp_size_local
+    if dp_size_local == -1:
+        dp_size_local = dp_size
+    dp_rank_start = args.dp_rank_start
+    dp_address = args.dp_address
+    dp_rpc_port = args.dp_rpc_port
+    vllm_start_port = args.vllm_start_port
+
+    def run_command(visible_devices, dp_rank, vllm_engine_port):
+        # Launch one engine (one local DP rank) through the per-node template script.
+        command = [
+            "bash",
+            "./run_dp_template.sh",
+            visible_devices,
+            str(vllm_engine_port),
+            str(dp_size),
+            str(dp_rank),
+            dp_address,
+            dp_rpc_port,
+            str(tp_size),
+        ]
+        subprocess.run(command, check=True)
+
+    if __name__ == "__main__":
+        template_path = "./run_dp_template.sh"
+        if not os.path.exists(template_path):
+            print(f"Template file {template_path} does not exist.")
+            sys.exit(1)
+
+        processes = []
+        for i in range(dp_size_local):
+            dp_rank = dp_rank_start + i
+            vllm_engine_port = vllm_start_port + i
+            # Each local rank gets its own contiguous slice of tp_size devices.
+            visible_devices = ",".join(str(x) for x in range(i * tp_size, (i + 1) * tp_size))
+            process = multiprocessing.Process(target=run_command,
+                                              args=(visible_devices, dp_rank,
+                                                    vllm_engine_port))
+            processes.append(process)
+            process.start()
+
+        for process in processes:
+            process.join()
+    ```
+
+2. Prepare the matching `run_dp_template.sh` on each node; note that the script content differs between prefill and decode nodes:
+
+    1. Prefill node 0
+
+        ```shell
+        nic_name="xxxx" # change to your own nic name
+        local_ip="xxxx" # change to your own ip
+
+        export HCCL_IF_IP=$local_ip
+        export GLOO_SOCKET_IFNAME=$nic_name
+        export TP_SOCKET_IFNAME=$nic_name
+        export HCCL_SOCKET_IFNAME=$nic_name
+        export HCCL_BUFFSIZE=256
+        export HCCL_OP_EXPANSION_MODE="AIV"
+        export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+        export OMP_PROC_BIND=false
+        export OMP_NUM_THREADS=1
+        export ASCEND_AGGREGATE_ENABLE=1
+        export ASCEND_TRANSPORT_PRINT=1
+        export ACL_OP_INIT_MODE=1
+        export ASCEND_A3_ENABLE=1
+        export VLLM_ASCEND_BALANCE_SCHEDULING=1
+        export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1
+        export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
+        export VLLM_ASCEND_ENABLE_FUSED_MC2=1
+        export ASCEND_RT_VISIBLE_DEVICES=$1
+        export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH
+
+        vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \
+            --host 0.0.0.0 \
+            --port $2 \
+            --data-parallel-size $3 \
+            --data-parallel-rank $4 \
+            --data-parallel-address $5 \
+            --data-parallel-rpc-port $6 \
+            --tensor-parallel-size $7 \
+            --enable-expert-parallel \
+            --seed 1024 \
+            --served-model-name glm \
+            --max-model-len 133000 \
+            --max-num-batched-tokens 8192 \
+            --trust-remote-code \
+            --max-num-seqs 64 \
+            --gpu-memory-utilization 0.9 \
+            --quantization ascend \
+            --enforce-eager \
+            --speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \
+            --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile", "torch_profiler_with_stack": false}' \
+            --additional-config '{"recompute_scheduler_enable": true, "enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}' \
+            --kv-transfer-config \
+            '{"kv_connector": "MooncakeConnectorV1",
+            "kv_role": "kv_producer",
+            "kv_port": "30000",
+            "engine_id": "0",
+            "kv_connector_extra_config": {
+                "prefill": {
+                    "dp_size": 2,
+                    "tp_size": 8
+                },
+                "decode": {
+                    "dp_size": 8,
+                    "tp_size": 4
+                }
+            }
+            }' 2>&1
+        ```
+
+    2. 
Prefill node 1 + + ```shell + nic_name="xxxx" # change to your own nic name + local_ip="xxxx" # change to your own ip + + export HCCL_IF_IP=$local_ip + export GLOO_SOCKET_IFNAME=$nic_name + export TP_SOCKET_IFNAME=$nic_name + export HCCL_SOCKET_IFNAME=$nic_name + export HCCL_BUFFSIZE=256 + export HCCL_OP_EXPANSION_MODE="AIV" + export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True + export OMP_PROC_BIND=false + export OMP_NUM_THREADS=1 + export ASCEND_AGGREGATE_ENABLE=1 + export ASCEND_TRANSPORT_PRINT=1 + export ACL_OP_INIT_MODE=1 + export ASCEND_A3_ENABLE=1 + export VLLM_ASCEND_BALANCE_SCHEDULING=1 + export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1 + export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 + export VLLM_ASCEND_ENABLE_FUSED_MC2=1 + export ASCEND_RT_VISIBLE_DEVICES=$1 + export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH + + vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \ + --host 0.0.0.0 \ + --port $2 \ + --data-parallel-size $3 \ + --data-parallel-rank $4 \ + --data-parallel-address $5 \ + --data-parallel-rpc-port $6 \ + --tensor-parallel-size $7 \ + --enable-expert-parallel \ + --seed 1024 \ + --served-model-name glm \ + --max-model-len 133000 \ + --max-num-batched-tokens 8192 \ + --trust-remote-code \ + --max-num-seqs 64 \ + --gpu-memory-utilization 0.9 \ + --quantization ascend \ + --enforce-eager \ + --speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \ + --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile", "torch_profiler_with_stack": false}' \ + --additional-config '{"recompute_scheduler_enable": true, "enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}' \ + --kv-transfer-config \ + '{"kv_connector": "MooncakeConnectorV1", + "kv_role": "kv_producer", + "kv_port": "30000", + "engine_id": "0", + "kv_connector_extra_config": { + "prefill": { + "dp_size": 2, + "tp_size": 8 + }, + "decode": { + "dp_size": 8, + "tp_size": 4 + } + } + }' 2>&1 + ``` + + 3. 
Decode node 0
+
+        ```shell
+        nic_name="xxxx" # change to your own nic name
+        local_ip="xxxx" # change to your own ip
+        export HCCL_IF_IP=$local_ip
+        export GLOO_SOCKET_IFNAME=$nic_name
+        export TP_SOCKET_IFNAME=$nic_name
+        export HCCL_SOCKET_IFNAME=$nic_name
+
+        export HCCL_BUFFSIZE=512
+        export HCCL_OP_EXPANSION_MODE="AIV"
+        export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+        export OMP_PROC_BIND=false
+        export OMP_NUM_THREADS=1
+        export ASCEND_AGGREGATE_ENABLE=1
+        export ASCEND_TRANSPORT_PRINT=1
+        export ACL_OP_INIT_MODE=1
+        export ASCEND_A3_ENABLE=1
+        export VLLM_NIXL_ABORT_REQUEST_TIMEOUT=300000
+        export TASK_QUEUE_ENABLE=1
+        export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH
+        export VLLM_ASCEND_BALANCE_SCHEDULING=1
+        export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1
+        export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
+        export VLLM_ASCEND_ENABLE_FUSED_MC2=1
+        export ASCEND_RT_VISIBLE_DEVICES=$1
+
+        vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \
+            --host 0.0.0.0 \
+            --port $2 \
+            --data-parallel-size $3 \
+            --data-parallel-rank $4 \
+            --data-parallel-address $5 \
+            --data-parallel-rpc-port $6 \
+            --tensor-parallel-size $7 \
+            --enable-expert-parallel \
+            --seed 1024 \
+            --served-model-name glm \
+            --max-model-len 133000 \
+            --max-num-batched-tokens 128 \
+            --max-num-seqs 4 \
+            --trust-remote-code \
+            --async-scheduling \
+            --gpu-memory-utilization 0.9 \
+            --quantization ascend \
+            --speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \
+            --profiler-config \
+            '{"profiler": "torch",
+            "torch_profiler_dir": "./vllm_profile",
+            "torch_profiler_with_stack": false}' \
+            --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,2,4,6,8,10,12,14,16,18,20,24,26,28,30,32,64,128,256,512]}' \
+            --additional-config '{"recompute_scheduler_enable": true, "enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}' \
+            --kv-transfer-config \
+            '{"kv_connector": "MooncakeConnectorV1",
+            "kv_role": "kv_consumer",
+            "kv_port": "30200",
+            "engine_id": "2",
+            "kv_connector_extra_config": {
+                "prefill": {
+                    "dp_size": 2,
+                    "tp_size": 8
+                },
+                "decode": {
+                    "dp_size": 8,
+                    "tp_size": 4
+                }
+            }
+            }'
+        ```
+
+    4. Decode node 1
+
+        ```shell
+        nic_name="xxxx" # change to your own nic name
+        local_ip="xxxx" # change to your own ip
+        export HCCL_IF_IP=$local_ip
+        export GLOO_SOCKET_IFNAME=$nic_name
+        export TP_SOCKET_IFNAME=$nic_name
+        export HCCL_SOCKET_IFNAME=$nic_name
+
+        export HCCL_BUFFSIZE=512
+        export HCCL_OP_EXPANSION_MODE="AIV"
+        export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+        export OMP_PROC_BIND=false
+        export OMP_NUM_THREADS=1
+        export ASCEND_AGGREGATE_ENABLE=1
+        export ASCEND_TRANSPORT_PRINT=1
+        export ACL_OP_INIT_MODE=1
+        export ASCEND_A3_ENABLE=1
+        export VLLM_NIXL_ABORT_REQUEST_TIMEOUT=300000
+        export TASK_QUEUE_ENABLE=1
+        export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH
+        export VLLM_ASCEND_BALANCE_SCHEDULING=1
+        export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1
+        export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
+        export VLLM_ASCEND_ENABLE_FUSED_MC2=1
+        export ASCEND_RT_VISIBLE_DEVICES=$1
+
+        vllm serve Eco-Tech/GLM-4.7-W8A8-floatmtp \
+            --host 0.0.0.0 \
+            --port $2 \
+            --data-parallel-size $3 \
+            --data-parallel-rank $4 \
+            --data-parallel-address $5 \
+            --data-parallel-rpc-port $6 \
+            --tensor-parallel-size $7 \
+            --enable-expert-parallel \
+            --seed 1024 \
+            --served-model-name glm \
+            --max-model-len 133000 \
+            --max-num-batched-tokens 128 \
+            --max-num-seqs 4 \
+            --trust-remote-code \
+            --async-scheduling \
+            --gpu-memory-utilization 0.9 \
+            --quantization ascend \
+            --speculative-config '{"num_speculative_tokens": 3, "model":"Eco-Tech/GLM-4.7-W8A8-floatmtp", "method":"mtp"}' \
+            --profiler-config \
+            '{"profiler": "torch",
+            "torch_profiler_dir": "./vllm_profile",
+            "torch_profiler_with_stack": false}' \
+            --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,2,4,6,8,10,12,14,16,18,20,24,26,28,30,32,64,128,256,512]}' \
+            --additional-config '{"recompute_scheduler_enable": true, "enable_shared_expert_dp": true, "ascend_fusion_config": {"fusion_ops_gmmswigluquant": false}}' \
+            --kv-transfer-config \
+            '{"kv_connector": "MooncakeConnectorV1",
+            "kv_role": "kv_consumer",
+            "kv_port": "30200",
+            "engine_id": "2",
+            "kv_connector_extra_config": {
+                "prefill": {
+                    "dp_size": 2,
+                    "tp_size": 8
+                },
+                "decode": {
+                    "dp_size": 8,
+                    "tp_size": 4
+                }
+            }
+            }'
+        ```
+
+Once the preparation is done, you can start the servers with the following commands, one on each node:
+
+1. Prefill node 0
+
+```shell
+# change ip to your own
+python launch_online_dp.py --dp-size 2 --tp-size 8 --dp-size-local 2 --dp-rank-start 0 --dp-address $node_p0_ip --dp-rpc-port 12880 --vllm-start-port 9300
+```
+
+2. Prefill node 1
+
+```shell
+# change ip to your own
+python launch_online_dp.py --dp-size 2 --tp-size 8 --dp-size-local 2 --dp-rank-start 0 --dp-address $node_p1_ip --dp-rpc-port 12880 --vllm-start-port 9300
+```
+
+3. Decode node 0
+
+```shell
+# change ip to your own
+python launch_online_dp.py --dp-size 8 --tp-size 4 --dp-size-local 4 --dp-rank-start 0 --dp-address $node_d0_ip --dp-rpc-port 12778 --vllm-start-port 9300
+```
+
+4. Decode node 1
+
+```shell
+# change ip to your own
+python launch_online_dp.py --dp-size 8 --tp-size 4 --dp-size-local 4 --dp-rank-start 4 --dp-address $node_d0_ip --dp-rpc-port 12778 --vllm-start-port 9300
+```
+
+### Request Forwarding
+
+To set up request forwarding, run the following script on any machine. 
You can get the proxy program in the repository's examples: [load_balance_proxy_server_example.py](https://github.com/vllm-project/vllm-ascend/blob/main/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py)
+
+```shell
+unset http_proxy
+unset https_proxy
+
+python load_balance_proxy_server_example.py \
+    --port 8000 \
+    --host 0.0.0.0 \
+    --prefiller-hosts \
+    $node_p0_ip $node_p0_ip \
+    $node_p1_ip $node_p1_ip \
+    --prefiller-ports \
+    9300 9301 \
+    9300 9301 \
+    --decoder-hosts \
+    $node_d0_ip \
+    $node_d0_ip \
+    $node_d0_ip \
+    $node_d0_ip \
+    $node_d1_ip \
+    $node_d1_ip \
+    $node_d1_ip \
+    $node_d1_ip \
+    --decoder-ports \
+    9300 9301 9302 9303 \
+    9300 9301 9302 9303
+```

## Accuracy Evaluation

@@ -219,12 +676,12 @@ Here are two accuracy evaluation methods.

1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details.

-2. After execution, you can get the result, here is the result of `GLM4.6` in `vllm-ascend:main` (after `vllm-ascend:0.13.0rc1`) for reference only.
+2. After execution, you can get the result. Here is the result of `GLM4.7` in `vllm-ascend:main` (after `vllm-ascend:0.14.0rc1`), for reference only.

| dataset | version | metric | mode | vllm-api-general-chat | note |
|----- | ----- | ----- | ----- | -----| ----- |
-| gsm8k | - | accuracy | gen | 96.13 | 1 Atlas 800 A3 (64G × 16) |
-| gsm8k | - | accuracy | gen | 96.06 | GPU |
+| GPQA | - | accuracy | gen | 84.85 | 1 Atlas 800 A3 (64G × 16) |
+| MATH500 | - | accuracy | gen | 98.8 | 1 Atlas 800 A3 (64G × 16) |

### Using Language Model Evaluation Harness

@@ -261,7 +718,7 @@ vllm bench serve \
    --prefix-repetition-num-prefixes 1 \
    --ignore-eos \
    --model glm \
-    --tokenizer /weight/glm4.5_w8a8_with_float_mtp \
+    --tokenizer Eco-Tech/GLM-4.7-W8A8-floatmtp \
    --seed 1000 \
    --host 0.0.0.0 \
    --port 8000 \