[Bugfix] fix ZeroDivisionError when prefill_tp_size > num_kv_head and fix tp_resharding README (#3437)
### What this PR does / why we need it? Fixes a ZeroDivisionError when prefill_tp_size > num_kv_head. In this situation, num_head_replica can be 0 and is then used as a divisor; this PR restricts the minimum value of num_head_replica to 1. This PR also fixes the tp_resharding README. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By CI. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: liziyu <liziyu16@huawei.com> Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com> Co-authored-by: liziyu <liziyu16@huawei.com>
This commit is contained in:
@@ -114,10 +114,10 @@ export VLLM_USE_V1=1
|
|||||||
export HCCL_BUFFSIZE=1024
|
export HCCL_BUFFSIZE=1024
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
export ASCEND_AGGREGATE_ENABLE=1
|
export ASCEND_AGGREGATE_ENABLE=1 # enable aggregated transmission
|
||||||
export ASCEND_TRANSPORT_PRINT=0
|
export ASCEND_TRANSPORT_PRINT=0 # print ascend transport logs
|
||||||
export ACL_OP_INIT_MODE=1
|
export ACL_OP_INIT_MODE=1 # acl op initialization mode to prevent device id acquisition failure
|
||||||
export ASCEND_A3_ENABLE=1
|
export ASCEND_A3_ENABLE=1 # enable hccs transmission for A3; set to 0 for A2
|
||||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
||||||
@@ -137,7 +137,6 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
|||||||
--max-model-len 32768 \
|
--max-model-len 32768 \
|
||||||
--max-num-batched-tokens 32768 \
|
--max-num-batched-tokens 32768 \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--no-enable-prefix-caching \
|
|
||||||
--gpu-memory-utilization 0.9 \
|
--gpu-memory-utilization 0.9 \
|
||||||
--kv-transfer-config \
|
--kv-transfer-config \
|
||||||
'{"kv_connector": "MooncakeLayerwiseConnector",
|
'{"kv_connector": "MooncakeLayerwiseConnector",
|
||||||
@@ -197,7 +196,6 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
|||||||
--max-model-len 32768 \
|
--max-model-len 32768 \
|
||||||
--max-num-batched-tokens 32768 \
|
--max-num-batched-tokens 32768 \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--no-enable-prefix-caching \
|
|
||||||
--gpu-memory-utilization 0.9 \
|
--gpu-memory-utilization 0.9 \
|
||||||
--kv-transfer-config \
|
--kv-transfer-config \
|
||||||
'{"kv_connector": "MooncakeLayerwiseConnector",
|
'{"kv_connector": "MooncakeLayerwiseConnector",
|
||||||
@@ -363,6 +361,10 @@ export VLLM_USE_V1=1
|
|||||||
export HCCL_BUFFSIZE=1024
|
export HCCL_BUFFSIZE=1024
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
|
export ASCEND_AGGREGATE_ENABLE=1
|
||||||
|
export ASCEND_TRANSPORT_PRINT=0
|
||||||
|
export ACL_OP_INIT_MODE=1
|
||||||
|
export ASCEND_A3_ENABLE=1
|
||||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
||||||
@@ -382,7 +384,6 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
|||||||
--max-model-len 32768 \
|
--max-model-len 32768 \
|
||||||
--max-num-batched-tokens 32768 \
|
--max-num-batched-tokens 32768 \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--no-enable-prefix-caching \
|
|
||||||
--gpu-memory-utilization 0.9 \
|
--gpu-memory-utilization 0.9 \
|
||||||
--kv-transfer-config \
|
--kv-transfer-config \
|
||||||
'{"kv_connector": "MooncakeConnector",
|
'{"kv_connector": "MooncakeConnector",
|
||||||
@@ -419,6 +420,10 @@ export VLLM_USE_V1=1
|
|||||||
export HCCL_BUFFSIZE=1024
|
export HCCL_BUFFSIZE=1024
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
|
export ASCEND_AGGREGATE_ENABLE=1
|
||||||
|
export ASCEND_TRANSPORT_PRINT=0
|
||||||
|
export ACL_OP_INIT_MODE=1
|
||||||
|
export ASCEND_A3_ENABLE=1
|
||||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
||||||
@@ -438,7 +443,6 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
|||||||
--max-model-len 32768 \
|
--max-model-len 32768 \
|
||||||
--max-num-batched-tokens 32768 \
|
--max-num-batched-tokens 32768 \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--no-enable-prefix-caching \
|
|
||||||
--gpu-memory-utilization 0.9 \
|
--gpu-memory-utilization 0.9 \
|
||||||
--kv-transfer-config \
|
--kv-transfer-config \
|
||||||
'{"kv_connector": "MooncakeConnector",
|
'{"kv_connector": "MooncakeConnector",
|
||||||
@@ -475,6 +479,10 @@ export VLLM_USE_V1=1
|
|||||||
export HCCL_BUFFSIZE=2048
|
export HCCL_BUFFSIZE=2048
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
|
export ASCEND_AGGREGATE_ENABLE=1
|
||||||
|
export ASCEND_TRANSPORT_PRINT=0
|
||||||
|
export ACL_OP_INIT_MODE=1
|
||||||
|
export ASCEND_A3_ENABLE=1
|
||||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
||||||
@@ -532,6 +540,10 @@ export VLLM_USE_V1=1
|
|||||||
export HCCL_BUFFSIZE=2048
|
export HCCL_BUFFSIZE=2048
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
|
export ASCEND_AGGREGATE_ENABLE=1
|
||||||
|
export ASCEND_TRANSPORT_PRINT=0
|
||||||
|
export ACL_OP_INIT_MODE=1
|
||||||
|
export ASCEND_A3_ENABLE=1
|
||||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
vllm serve /model/Qwen3-235B-A22B-W8A8 \
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ class TestKVCacheSendingLayerThreadBasic(unittest.TestCase):
|
|||||||
self.p1 = patch(
|
self.p1 = patch(
|
||||||
'vllm_ascend.distributed.mooncake_layerwise_connector.get_ascend_config',
|
'vllm_ascend.distributed.mooncake_layerwise_connector.get_ascend_config',
|
||||||
new=MagicMock(return_value=SimpleNamespace(
|
new=MagicMock(return_value=SimpleNamespace(
|
||||||
pd_tp_ratio=1, num_head_replica=0, pd_head_ratio=1)))
|
pd_tp_ratio=1, num_head_replica=1, pd_head_ratio=1)))
|
||||||
self.p2 = patch(
|
self.p2 = patch(
|
||||||
'vllm_ascend.distributed.mooncake_layerwise_connector.get_current_vllm_config',
|
'vllm_ascend.distributed.mooncake_layerwise_connector.get_current_vllm_config',
|
||||||
new=MagicMock(return_value=SimpleNamespace(
|
new=MagicMock(return_value=SimpleNamespace(
|
||||||
@@ -244,7 +244,7 @@ class TestSendingLayerThread(unittest.TestCase):
|
|||||||
self.p1 = patch(
|
self.p1 = patch(
|
||||||
'vllm_ascend.distributed.mooncake_layerwise_connector.get_ascend_config',
|
'vllm_ascend.distributed.mooncake_layerwise_connector.get_ascend_config',
|
||||||
new=MagicMock(return_value=SimpleNamespace(
|
new=MagicMock(return_value=SimpleNamespace(
|
||||||
pd_tp_ratio=1, num_head_replica=0, pd_head_ratio=1)))
|
pd_tp_ratio=1, num_head_replica=1, pd_head_ratio=1)))
|
||||||
self.p2 = patch(
|
self.p2 = patch(
|
||||||
'vllm_ascend.distributed.mooncake_layerwise_connector.get_current_vllm_config',
|
'vllm_ascend.distributed.mooncake_layerwise_connector.get_current_vllm_config',
|
||||||
new=MagicMock(return_value=SimpleNamespace(
|
new=MagicMock(return_value=SimpleNamespace(
|
||||||
@@ -903,7 +903,7 @@ class TestMooncakeLayerwiseConnectorWorker(unittest.TestCase):
|
|||||||
patch(
|
patch(
|
||||||
'vllm_ascend.distributed.mooncake_layerwise_connector.get_ascend_config',
|
'vllm_ascend.distributed.mooncake_layerwise_connector.get_ascend_config',
|
||||||
return_value=SimpleNamespace(pd_tp_ratio=1,
|
return_value=SimpleNamespace(pd_tp_ratio=1,
|
||||||
num_head_replica=0,
|
num_head_replica=1,
|
||||||
pd_head_ratio=1),
|
pd_head_ratio=1),
|
||||||
),
|
),
|
||||||
patch(
|
patch(
|
||||||
|
|||||||
@@ -102,7 +102,7 @@ class AscendConfig:
|
|||||||
)
|
)
|
||||||
self.pd_tp_ratio = 1
|
self.pd_tp_ratio = 1
|
||||||
self.pd_head_ratio = 1
|
self.pd_head_ratio = 1
|
||||||
self.num_head_replica = 0
|
self.num_head_replica = 1
|
||||||
if vllm_config.kv_transfer_config is not None and not vllm_config.model_config.is_deepseek_mla:
|
if vllm_config.kv_transfer_config is not None and not vllm_config.model_config.is_deepseek_mla:
|
||||||
prefill_tp_size = vllm_config.kv_transfer_config.get_from_extra_config(
|
prefill_tp_size = vllm_config.kv_transfer_config.get_from_extra_config(
|
||||||
"prefill", {"tp_size": 1})["tp_size"]
|
"prefill", {"tp_size": 1})["tp_size"]
|
||||||
@@ -115,7 +115,7 @@ class AscendConfig:
|
|||||||
# only support Qwen model now
|
# only support Qwen model now
|
||||||
# TODO: use a more robust method to get kv_head_num
|
# TODO: use a more robust method to get kv_head_num
|
||||||
num_kv_head = vllm_config.model_config.hf_config.num_key_value_heads
|
num_kv_head = vllm_config.model_config.hf_config.num_key_value_heads
|
||||||
self.num_head_replica = prefill_tp_size // num_kv_head
|
self.num_head_replica = prefill_tp_size // num_kv_head if prefill_tp_size >= num_kv_head else 1
|
||||||
prefill_tp_size = min(prefill_tp_size, num_kv_head)
|
prefill_tp_size = min(prefill_tp_size, num_kv_head)
|
||||||
decode_tp_size = min(decode_tp_size, num_kv_head)
|
decode_tp_size = min(decode_tp_size, num_kv_head)
|
||||||
self.pd_head_ratio = prefill_tp_size // decode_tp_size
|
self.pd_head_ratio = prefill_tp_size // decode_tp_size
|
||||||
|
|||||||
@@ -360,7 +360,7 @@ class SendingLayerThread(threading.Thread):
|
|||||||
remote_kv_base_addrs = req_meta.kv_caches_base_addr
|
remote_kv_base_addrs = req_meta.kv_caches_base_addr
|
||||||
|
|
||||||
remote_block_ids = req_meta.block_ids
|
remote_block_ids = req_meta.block_ids
|
||||||
if self.num_head_replica >= 1 and self.tp_rank % self.num_head_replica != 0:
|
if self.tp_rank % self.num_head_replica != 0:
|
||||||
pass
|
pass
|
||||||
elif self.pd_head_ratio == 1:
|
elif self.pd_head_ratio == 1:
|
||||||
layer_local_kv_base_addr = [
|
layer_local_kv_base_addr = [
|
||||||
|
|||||||
Reference in New Issue
Block a user