From c2c1db78a79c7b9cb2c87e4d0875a1219571c36c Mon Sep 17 00:00:00 2001 From: zxr2333 <64738772+nwpu-zxr@users.noreply.github.com> Date: Wed, 15 Oct 2025 08:45:44 +0800 Subject: [PATCH] [Bugfix] fix ZeroDivisionError when prefill_tp_size > num_kv_head and fix tp_resharding README (#3437) ### What this PR does / why we need it? Fix ZeroDivisionError when prefill_tp_size > num_kv_head; in this situation, num_head_replica can be 0 while being used as a divisor, so this PR restricts the minimum value of num_head_replica to 1. This PR also fixes the tp_resharding README. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By CI. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: liziyu Signed-off-by: nwpu-zxr Co-authored-by: liziyu --- .../multi_node_pd_disaggregation_mooncake.md | 28 +++++++++++++------ .../test_mooncake_layerwise_connector.py | 6 ++-- vllm_ascend/ascend_config.py | 4 +-- .../mooncake_layerwise_connector.py | 2 +- 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md b/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md index e9ad07d..81c930b 100644 --- a/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md +++ b/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md @@ -114,10 +114,10 @@ export VLLM_USE_V1=1 export HCCL_BUFFSIZE=1024 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 -export ASCEND_AGGREGATE_ENABLE=1 -export ASCEND_TRANSPORT_PRINT=0 -export ACL_OP_INIT_MODE=1 -export ASCEND_A3_ENABLE=1 +export ASCEND_AGGREGATE_ENABLE=1 # enable aggregated transmission +export ASCEND_TRANSPORT_PRINT=0 # print ascend transport logs +export ACL_OP_INIT_MODE=1 # acl op initialization mode to prevent device id acquisition failure +export ASCEND_A3_ENABLE=1 # enable hccs transmission for A3; set to 0 for A2 export 
LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH vllm serve /model/Qwen3-235B-A22B-W8A8 \ @@ -137,7 +137,6 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \ --max-model-len 32768 \ --max-num-batched-tokens 32768 \ --trust-remote-code \ - --no-enable-prefix-caching \ --gpu-memory-utilization 0.9 \ --kv-transfer-config \ '{"kv_connector": "MooncakeLayerwiseConnector", @@ -197,7 +196,6 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \ --max-model-len 32768 \ --max-num-batched-tokens 32768 \ --trust-remote-code \ - --no-enable-prefix-caching \ --gpu-memory-utilization 0.9 \ --kv-transfer-config \ '{"kv_connector": "MooncakeLayerwiseConnector", @@ -363,6 +361,10 @@ export VLLM_USE_V1=1 export HCCL_BUFFSIZE=1024 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 +export ASCEND_AGGREGATE_ENABLE=1 +export ASCEND_TRANSPORT_PRINT=0 +export ACL_OP_INIT_MODE=1 +export ASCEND_A3_ENABLE=1 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH vllm serve /model/Qwen3-235B-A22B-W8A8 \ @@ -382,7 +384,6 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \ --max-model-len 32768 \ --max-num-batched-tokens 32768 \ --trust-remote-code \ - --no-enable-prefix-caching \ --gpu-memory-utilization 0.9 \ --kv-transfer-config \ '{"kv_connector": "MooncakeConnector", @@ -419,6 +420,10 @@ export VLLM_USE_V1=1 export HCCL_BUFFSIZE=1024 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 +export ASCEND_AGGREGATE_ENABLE=1 +export ASCEND_TRANSPORT_PRINT=0 +export ACL_OP_INIT_MODE=1 +export ASCEND_A3_ENABLE=1 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH vllm serve /model/Qwen3-235B-A22B-W8A8 \ @@ -438,7 +443,6 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \ --max-model-len 32768 \ --max-num-batched-tokens 32768 \ --trust-remote-code \ - --no-enable-prefix-caching \ --gpu-memory-utilization 0.9 \ --kv-transfer-config \ '{"kv_connector": "MooncakeConnector", @@ -475,6 +479,10 @@ 
export VLLM_USE_V1=1 export HCCL_BUFFSIZE=2048 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 +export ASCEND_AGGREGATE_ENABLE=1 +export ASCEND_TRANSPORT_PRINT=0 +export ACL_OP_INIT_MODE=1 +export ASCEND_A3_ENABLE=1 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH vllm serve /model/Qwen3-235B-A22B-W8A8 \ @@ -532,6 +540,10 @@ export VLLM_USE_V1=1 export HCCL_BUFFSIZE=2048 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 +export ASCEND_AGGREGATE_ENABLE=1 +export ASCEND_TRANSPORT_PRINT=0 +export ACL_OP_INIT_MODE=1 +export ASCEND_A3_ENABLE=1 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH vllm serve /model/Qwen3-235B-A22B-W8A8 \ diff --git a/tests/ut/kv_connector/test_mooncake_layerwise_connector.py b/tests/ut/kv_connector/test_mooncake_layerwise_connector.py index 079df07..ae9ff04 100644 --- a/tests/ut/kv_connector/test_mooncake_layerwise_connector.py +++ b/tests/ut/kv_connector/test_mooncake_layerwise_connector.py @@ -79,7 +79,7 @@ class TestKVCacheSendingLayerThreadBasic(unittest.TestCase): self.p1 = patch( 'vllm_ascend.distributed.mooncake_layerwise_connector.get_ascend_config', new=MagicMock(return_value=SimpleNamespace( - pd_tp_ratio=1, num_head_replica=0, pd_head_ratio=1))) + pd_tp_ratio=1, num_head_replica=1, pd_head_ratio=1))) self.p2 = patch( 'vllm_ascend.distributed.mooncake_layerwise_connector.get_current_vllm_config', new=MagicMock(return_value=SimpleNamespace( @@ -244,7 +244,7 @@ class TestSendingLayerThread(unittest.TestCase): self.p1 = patch( 'vllm_ascend.distributed.mooncake_layerwise_connector.get_ascend_config', new=MagicMock(return_value=SimpleNamespace( - pd_tp_ratio=1, num_head_replica=0, pd_head_ratio=1))) + pd_tp_ratio=1, num_head_replica=1, pd_head_ratio=1))) self.p2 = patch( 'vllm_ascend.distributed.mooncake_layerwise_connector.get_current_vllm_config', new=MagicMock(return_value=SimpleNamespace( @@ -903,7 +903,7 @@ class 
TestMooncakeLayerwiseConnectorWorker(unittest.TestCase): patch( 'vllm_ascend.distributed.mooncake_layerwise_connector.get_ascend_config', return_value=SimpleNamespace(pd_tp_ratio=1, - num_head_replica=0, + num_head_replica=1, pd_head_ratio=1), ), patch( diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index 1e68558..cbd905e 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -102,7 +102,7 @@ class AscendConfig: ) self.pd_tp_ratio = 1 self.pd_head_ratio = 1 - self.num_head_replica = 0 + self.num_head_replica = 1 if vllm_config.kv_transfer_config is not None and not vllm_config.model_config.is_deepseek_mla: prefill_tp_size = vllm_config.kv_transfer_config.get_from_extra_config( "prefill", {"tp_size": 1})["tp_size"] @@ -115,7 +115,7 @@ class AscendConfig: # only support Qwen model now # TODO: use a more robust method to get kv_head_num num_kv_head = vllm_config.model_config.hf_config.num_key_value_heads - self.num_head_replica = prefill_tp_size // num_kv_head + self.num_head_replica = prefill_tp_size // num_kv_head if prefill_tp_size >= num_kv_head else 1 prefill_tp_size = min(prefill_tp_size, num_kv_head) decode_tp_size = min(decode_tp_size, num_kv_head) self.pd_head_ratio = prefill_tp_size // decode_tp_size diff --git a/vllm_ascend/distributed/mooncake_layerwise_connector.py b/vllm_ascend/distributed/mooncake_layerwise_connector.py index 87f59e8..457c737 100644 --- a/vllm_ascend/distributed/mooncake_layerwise_connector.py +++ b/vllm_ascend/distributed/mooncake_layerwise_connector.py @@ -360,7 +360,7 @@ class SendingLayerThread(threading.Thread): remote_kv_base_addrs = req_meta.kv_caches_base_addr remote_block_ids = req_meta.block_ids - if self.num_head_replica >= 1 and self.tp_rank % self.num_head_replica != 0: + if self.tp_rank % self.num_head_replica != 0: pass elif self.pd_head_ratio == 1: layer_local_kv_base_addr = [