From 3effc4bc70176a4a0dc1da82fdbd72c6cc6b42b5 Mon Sep 17 00:00:00 2001 From: pz1116 <47019764+Pz1116@users.noreply.github.com> Date: Thu, 19 Mar 2026 10:13:13 +0800 Subject: [PATCH] [Doc][KV Pool]Revision KV Pool User Guide (#7434) ### What this PR does / why we need it? Revise the KV Pool user guide: 1. Revise Mooncake environment variables and kvconnector extra configs. 2. Delete `use_ascend_direct` in kv connector extra config as it is deprecated 3. Delete `kv_buffer_device` and `kv_rank` in P2P mooncake config 4. Unifies default `max-model-len` and `max-num-batch-tokens` in examples given. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.17.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4497431df654e46fb1fb5e64bf8611e762ae5d87 --------- Signed-off-by: Pz1116 Co-authored-by: Chao Lei --- ...ng_sequence_context_parallel_multi_node.md | 2 - .../pd_colocated_mooncake_multi_instance.md | 2 - docs/source/tutorials/models/DeepSeek-V3.2.md | 4 - .../tutorials/models/Qwen3-235B-A22B.md | 3 - .../user_guide/feature_guide/kv_pool.md | 126 ++++++++---------- .../config/DeepSeek-V3_2-W8A8-EP.yaml | 4 - .../config/Qwen3-235B-disagg-pd.yaml | 2 - .../single_node/models/test_qwen3_30b_acc.py | 1 - 8 files changed, 58 insertions(+), 86 deletions(-) diff --git a/docs/source/tutorials/features/long_sequence_context_parallel_multi_node.md b/docs/source/tutorials/features/long_sequence_context_parallel_multi_node.md index 40efbe91..f2d6606e 100644 --- a/docs/source/tutorials/features/long_sequence_context_parallel_multi_node.md +++ b/docs/source/tutorials/features/long_sequence_context_parallel_multi_node.md @@ -124,7 +124,6 @@ vllm serve /path_to_weight/DeepSeek-V3.1_w8a8mix_mtp \ "kv_port": "30000", "engine_id": "0", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 1, "tp_size": 16 @@ -192,7 +191,6 @@ vllm serve /path_to_weight/DeepSeek-V3.1_w8a8mix_mtp \ "kv_port": "30000", 
"engine_id": "1", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 1, "tp_size": 16 diff --git a/docs/source/tutorials/features/pd_colocated_mooncake_multi_instance.md b/docs/source/tutorials/features/pd_colocated_mooncake_multi_instance.md index e993d4c4..9cffcfe2 100644 --- a/docs/source/tutorials/features/pd_colocated_mooncake_multi_instance.md +++ b/docs/source/tutorials/features/pd_colocated_mooncake_multi_instance.md @@ -185,7 +185,6 @@ The template for the mooncake.json file is as follows: "metadata_server": "P2PHANDSHAKE", "protocol": "ascend", "device_name": "", - "use_ascend_direct": true, "master_server_address": ":50088", "global_segment_size": 107374182400 } @@ -195,7 +194,6 @@ The template for the mooncake.json file is as follows: | --------------| ------------------------| -----------------------------------| | metadata_server | P2PHANDSHAKE | Point-to-point handshake mode | | protocol | ascend | Ascend proprietary protocol | -| use_ascend_direct | true | Enable direct hardware access | | master_server_address | 90.90.100.188:50088(for example) | Master server address| | global_segment_size | 107374182400 | Size per segment (100 GB) | diff --git a/docs/source/tutorials/models/DeepSeek-V3.2.md b/docs/source/tutorials/models/DeepSeek-V3.2.md index c1870ccb..65563e30 100644 --- a/docs/source/tutorials/models/DeepSeek-V3.2.md +++ b/docs/source/tutorials/models/DeepSeek-V3.2.md @@ -564,7 +564,6 @@ Before you start, please "kv_port": "30000", "engine_id": "0", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 2, "tp_size": 16 @@ -639,7 +638,6 @@ Before you start, please "kv_port": "30000", "engine_id": "0", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 2, "tp_size": 16 @@ -716,7 +714,6 @@ Before you start, please "kv_port": "30100", "engine_id": "1", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 2, "tp_size": 16 
@@ -793,7 +790,6 @@ Before you start, please "kv_port": "30100", "engine_id": "1", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 2, "tp_size": 16 diff --git a/docs/source/tutorials/models/Qwen3-235B-A22B.md b/docs/source/tutorials/models/Qwen3-235B-A22B.md index cf74f381..a41733d2 100644 --- a/docs/source/tutorials/models/Qwen3-235B-A22B.md +++ b/docs/source/tutorials/models/Qwen3-235B-A22B.md @@ -448,7 +448,6 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ "kv_port": "30000", "engine_id": "0", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 2, "tp_size": 8 @@ -513,7 +512,6 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ "kv_port": "30100", "engine_id": "1", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 2, "tp_size": 8 @@ -579,7 +577,6 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ "kv_port": "30100", "engine_id": "1", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 2, "tp_size": 8 diff --git a/docs/source/user_guide/feature_guide/kv_pool.md b/docs/source/user_guide/feature_guide/kv_pool.md index 21614f63..04a954ac 100644 --- a/docs/source/user_guide/feature_guide/kv_pool.md +++ b/docs/source/user_guide/feature_guide/kv_pool.md @@ -3,18 +3,24 @@ ## Environmental Dependencies * Software: - * Python >= 3.10, < 3.12 - * CANN == 8.3.rc2 - * PyTorch == 2.8.0, torch-npu == 2.8.0 + * CANN >= 8.5.0 * vLLM:main branch * vLLM-Ascend:main branch + * mooncake:>= 0.3.9 ### KV Pool Parameter Description -**kv_connector_extra_config**: Additional Configurable Parameters for Pooling. -**lookup_rpc_port**: Port for RPC Communication Between Pooling Scheduler Process and Worker Process: Each Instance Requires a Unique Port Configuration. -**load_async**: Whether to Enable Asynchronous Loading. The default value is false. -**backend**: Set the storage backend for kvpool, with the default being mooncake. 
+#### `kv_connector_extra_config`: Additional Configurable Parameters for Pooling + +| Parameter | Description | +| :--- | :--- | +| `lookup_rpc_port` | Port for RPC Communication Between Pooling Scheduler Process and Worker Process: Each Instance Requires a Unique Port Configuration. | +| `load_async` | Whether to Enable Asynchronous Loading. The default value is false. | +| `backend` | Set the storage backend for kvpool, with the default being mooncake. | +| `consumer_is_to_put` | Whether Decode node put KV Cache into KV Pool. The default value is false. | +| `consumer_is_to_load` | Whether Decode node load KV cache from KV Pool. The default value is false. | +| `prefill_pp_size` | Prefill PP size, needs to be set when Prefill node enables PP. | +| `prefill_pp_layer_partition` | Prefill PP layer partition, needs to be set when Prefill node enables PP. | ### Environment Variable Configuration @@ -87,12 +93,11 @@ export PYTHONHASHSEED=0 ### Environment Variables Description -`export ASCEND_ENABLE_USE_FABRIC_MEM=1`: Enable unified memory address direct transmission scheme and only can be used for 800 I/T A3 series. Required supporting hardware versions are as follows: - - HDK >=26.0 - CANN >= 9.0 - -`export ASCEND_BUFFER_POOL=4:8`: ASCEND_BUFFER_POOL is the environment variable for configuring the number and size of buffer on NPU Device for aggregation and KV transfer,the value 4:8 means we allocate 4 buffers of size 8MB. It only can be used for 800 I/T A2 series. +| Hardware | HDK & CANN versions | Export Command | Description | +| :--- | :--- | :--- | :--- | +| 800 I/T A3 series | HDK >= 26.0.0
CANN >= 9.0.0 | `export ASCEND_ENABLE_USE_FABRIC_MEM=1` | **Recommended**. Enables unified memory address direct transmission scheme. | +| 800 I/T A3 series | 25.5.0<=HDK<26.0.0 | `export ASCEND_BUFFER_POOL=4:8` | Configures the number and size of buffers on the NPU Device for aggregation and KV transfer (e.g., `4:8` means 4 buffers of 8MB). | +| 800 I/T A2 series | N/A | `export HCCL_INTRA_ROCE_ENABLE=1` | Required by direct transmission scheme on 800 I/T A2 series| ### Run Mooncake Master @@ -114,7 +119,7 @@ The environment variable **MOONCAKE_CONFIG_PATH** is configured to the full path **protocol:** Must be set to 'Ascend' on the NPU. **device_name**: "" **master_server_address**: Configured with the IP and port of the master service. -**global_segment_size**: Registered memory size per card to the KV Pool. +**global_segment_size**: Registered memory size per card to the KV Pool. **Needs to be aligned to 1GB.** #### 2.Start mooncake_master @@ -147,9 +152,10 @@ export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json" export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 export ACL_OP_INIT_MODE=1 - -# ASCEND_BUFFER_POOL is the environment variable for configuring the number and size of buffer on NPU Device for aggregation and KV transfer,the value 4:8 means we allocate 4 buffers of size 8MB. -export ASCEND_BUFFER_POOL=4:8 +#A3 +export ASCEND_ENABLE_USE_FABRIC_MEM=1 +#A2 +#export HCCL_INTRA_ROCE_ENABLE=1 # Unit: ms. The timeout for one-sided communication connection establishment is set to 10 seconds by default (see PR: https://github.com/kvcache-ai/Mooncake/pull/1039). Users can adjust this value based on their specific setup. # The recommended formula is: ASCEND_CONNECT_TIMEOUT = connection_time_per_card (typically within 500ms) × total_number_of_Decode_cards. 
@@ -164,12 +170,12 @@ python3 -m vllm.entrypoints.openai.api_server \ --port 8100 \ --trust-remote-code \ --enforce-eager \ - --no_enable_prefix_caching \ + --no-enable-prefix-caching \ --tensor-parallel-size 1 \ --data-parallel-size 1 \ - --max-model-len 10000 \ + --max-model-len 32768 \ --block-size 128 \ - --max-num-batched-tokens 4096 \ + --max-num-batched-tokens 16384 \ --kv-transfer-config \ '{ "kv_connector": "MultiConnector", @@ -181,7 +187,6 @@ python3 -m vllm.entrypoints.openai.api_server \ "kv_role": "kv_producer", "kv_port": "20001", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 1, "tp_size": 1 @@ -220,7 +225,10 @@ export PYTHONHASHSEED=0  export MOONCAKE_CONFIG_PATH="/xxxxx/mooncake.json" export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7 export ACL_OP_INIT_MODE=1 -export ASCEND_BUFFER_POOL=4:8 +#A3 +export ASCEND_ENABLE_USE_FABRIC_MEM=1 +#A2 +#export HCCL_INTRA_ROCE_ENABLE=1 export ASCEND_CONNECT_TIMEOUT=10000 export ASCEND_TRANSFER_TIMEOUT=10000 @@ -229,12 +237,12 @@ python3 -m vllm.entrypoints.openai.api_server \ --port 8200 \ --trust-remote-code \ --enforce-eager \ - --no_enable_prefix_caching \ + --no-enable-prefix-caching \ --tensor-parallel-size 1 \ --data-parallel-size 1 \ - --max-model-len 10000 \ + --max-model-len 32768 \ --block-size 128 \ - --max-num-batched-tokens 4096 \ + --max-num-batched-tokens 16384 \ --kv-transfer-config \ '{ "kv_connector": "MultiConnector", @@ -331,7 +339,10 @@ export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json" export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 export PYTHONHASHSEED=0  export ACL_OP_INIT_MODE=1 -export ASCEND_BUFFER_POOL=4:8 +#A3 +export ASCEND_ENABLE_USE_FABRIC_MEM=1 +#A2 +#export HCCL_INTRA_ROCE_ENABLE=1 export ASCEND_CONNECT_TIMEOUT=10000 export ASCEND_TRANSFER_TIMEOUT=10000 @@ -340,12 +351,12 @@ python3 -m vllm.entrypoints.openai.api_server \ --port 8100 \ --trust-remote-code \ --enforce-eager \ - --no_enable_prefix_caching \ + --no-enable-prefix-caching \ 
--tensor-parallel-size 1 \ --data-parallel-size 1 \ - --max-model-len 10000 \ + --max-model-len 32768 \ --block-size 128 \ - --max-num-batched-tokens 4096 \ + --max-num-batched-tokens 16384 \ --kv-transfer-config \ '{ "kv_connector": "AscendStoreConnector", @@ -616,13 +627,12 @@ vllm serve xxxxxxx/Qwen3-32B \ --tensor-parallel-size 4 \ --seed 1024 \ --served-model-name qwen3 \ - --max-model-len 65536 \ + --max-model-len 32768 \ --max-num-batched-tokens 16384 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ --max-num_seqs 20 \ --no-enable-prefix-caching \ - --additional_config='{"ascend_scheduler_config":{"enabled":false}, "enable_shared_expert_dp":false}' \ --kv-transfer-config \ '{ "kv_connector": "MultiConnector", @@ -633,11 +643,8 @@ vllm serve xxxxxxx/Qwen3-32B \ { "kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", - "kv_buffer_device": "npu", - "kv_rank": 0, "kv_port": "20001", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 2, "tp_size": 4 @@ -699,13 +706,12 @@ vllm serve xxxxxxx/Qwen3-32B \ --tensor-parallel-size 4 \ --seed 1024 \ --served-model-name qwen3 \ - --max-model-len 65536 \ + --max-model-len 32768 \ --max-num-batched-tokens 16384 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ --max-num_seqs 20 \ --no-enable-prefix-caching \ - --additional_config='{"ascend_scheduler_config":{"enabled":false}, "enable_shared_expert_dp":false}' \ --kv-transfer-config \ '{ "kv_connector": "MultiConnector", @@ -715,11 +721,8 @@ vllm serve xxxxxxx/Qwen3-32B \ { "kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", - "kv_buffer_device": "npu", - "kv_rank": 1, "kv_port": "20002", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 2, "tp_size": 4 @@ -774,10 +777,9 @@ python -m vllm.entrypoints.openai.api_server \ --data-parallel-size 2 \ --tensor-parallel-size 8 \ --port 30050 \ - --max-num_seqs 28 \ - --max-model-len 16384 \ + --max-num_seqs 20 \ + --max-model-len 
32768 \ --max-num-batched-tokens 16384 \ - --additional_config='{"ascend_scheduler_config":{"enabled":false}, "enable_shared_expert_dp":false}' \ --enable_expert_parallel \ --quantization ascend \ --gpu-memory-utilization 0.90 \ @@ -792,11 +794,8 @@ python -m vllm.entrypoints.openai.api_server \ { "kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", - "kv_buffer_device": "npu", - "kv_rank": 0, "kv_port": "20001", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 2, "tp_size": 8 @@ -846,15 +845,14 @@ python -m vllm.entrypoints.openai.api_server \ --data-parallel-size 2 \ --tensor-parallel-size 8 \ --port 30060 \ - --max-model-len 16384 \ - --max-num-batched-tokens 5200 \ + --max-model-len 32768 \ + --max-num-batched-tokens 16384 \ --enforce-eager\ --quantization ascend \ --no-enable-prefix-caching \ - --max-num_seqs 28 \ + --max-num_seqs 20 \ --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \ --enable_expert_parallel \ - --additional_config='{"ascend_scheduler_config":{"enabled":false}, "enable_shared_expert_dp":false}' \ --gpu-memory-utilization 0.9 \ --kv-transfer-config \ '{ @@ -865,11 +863,8 @@ python -m vllm.entrypoints.openai.api_server \ { "kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", - "kv_buffer_device": "npu", - "kv_rank": 1, "kv_port": "20002", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 2, "tp_size": 8 @@ -905,7 +900,7 @@ python -m vllm.entrypoints.openai.api_server \ The deepseek model needs to be run in a two-node cluster. 
-**Run_hunbu_1.sh:** +**Run_pd_mix_1.sh:** ```shell rm -rf /root/ascend/log/* @@ -948,7 +943,7 @@ vllm serve xxxxxxx/DeepSeek-R1 \ --tensor-parallel-size 8 \ --seed 1024 \ --served-model-name deepseek \ - --max-model-len 65536 \ + --max-model-len 32768 \ --max-num-batched-tokens 16384 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ @@ -956,7 +951,6 @@ vllm serve xxxxxxx/DeepSeek-R1 \ --max-num_seqs 20 \ --enable-expert-parallel \ --no-enable-prefix-caching \ - --additional_config='{"ascend_scheduler_config":{"enabled":false}, "enable_shared_expert_dp":false}' \ --kv-transfer-config \ '{ "kv_connector": "AscendStoreConnector", @@ -965,11 +959,11 @@ vllm serve xxxxxxx/DeepSeek-R1 \ "backend": "memcache", "lookup_rpc_port":"0" } - }' > log_hunbu_1.log 2>&1 + }' > log_pd_mix_1.log 2>&1 ``` -**Run_hunbu_2.sh:** +**Run_pd_mix_2.sh:** ```shell rm -rf /root/ascend/log/* @@ -1014,7 +1008,7 @@ vllm serve xxxxxxx/DeepSeek-R1 \ --tensor-parallel-size 8 \ --seed 1024 \ --served-model-name deepseek \ - --max-model-len 65536 \ + --max-model-len 32768 \ --max-num-batched-tokens 16384 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ @@ -1022,16 +1016,15 @@ vllm serve xxxxxxx/DeepSeek-R1 \ --max-num_seqs 20 \ --enable-expert-parallel \ --no-enable-prefix-caching \ - --additional_config='{"ascend_scheduler_config":{"enabled":false}, "enable_shared_expert_dp":false, "chunked_prefill_for_mla":true}' \ --kv-transfer-config \ '{ "kv_connector": "AscendStoreConnector", "kv_role": "kv_both", "kv_connector_extra_config": { "backend": "memcache", - "mooncake_rpc_port":"0" + "lookup_rpc_port":"0" } - }' > log_hunbu_2.log 2>&1 + }' > log_pd_mix_2.log 2>&1 ``` @@ -1069,12 +1062,11 @@ python -m vllm.entrypoints.openai.api_server \ -dp 2 \ -tp 8 \ --port 30050 \ - --max-num_seqs 28 \ - --max-model-len 16384 \ + --max-num_seqs 20 \ + --max-model-len 32768 \ --max-num-batched-tokens 16384 \ --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \ 
--compilation_config '{"cudagraph_mode":"FULL_DECODE_ONLY"}' \ - --additional_config='{"ascend_scheduler_config":{"enabled":false}, "enable_shared_expert_dp":false, "chunked_prefill_for_mla":true}' \ --enable_expert_parallel \ --quantization ascend \ --gpu-memory-utilization 0.90 \ @@ -1085,10 +1077,8 @@ python -m vllm.entrypoints.openai.api_server \ "kv_role": "kv_both", "kv_connector_extra_config": { "backend": "memcache", - "mooncake_rpc_port":"0" + "lookup_rpc_port":"0" } - }' > log_hunbu.log 2>&1 + }' > log_pd_mix.log 2>&1 ``` - -#### [2.Run Inference](#2run-inference) diff --git a/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-EP.yaml b/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-EP.yaml index 4dfc3212..3c33d40b 100644 --- a/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-EP.yaml +++ b/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-EP.yaml @@ -60,7 +60,6 @@ deployment: "kv_port": "30000", "engine_id": "0", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 2, "tp_size": 16 @@ -106,7 +105,6 @@ deployment: "kv_port": "30000", "engine_id": "0", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 2, "tp_size": 16 @@ -152,7 +150,6 @@ deployment: "kv_port": "30200", "engine_id": "1", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 2, "tp_size": 16 @@ -200,7 +197,6 @@ deployment: "kv_port": "30200", "engine_id": "1", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 2, "tp_size": 16 diff --git a/tests/e2e/nightly/multi_node/config/Qwen3-235B-disagg-pd.yaml b/tests/e2e/nightly/multi_node/config/Qwen3-235B-disagg-pd.yaml index 4139a35d..9391352d 100644 --- a/tests/e2e/nightly/multi_node/config/Qwen3-235B-disagg-pd.yaml +++ b/tests/e2e/nightly/multi_node/config/Qwen3-235B-disagg-pd.yaml @@ -45,7 +45,6 @@ deployment: "kv_port": "30000", "engine_id": "0", "kv_connector_extra_config": { - 
"use_ascend_direct": true, "prefill": { "dp_size": 2, "tp_size": 8 @@ -84,7 +83,6 @@ deployment: "kv_port": "30100", "engine_id": "1", "kv_connector_extra_config": { - "use_ascend_direct": true, "prefill": { "dp_size": 2, "tp_size": 8 diff --git a/tests/e2e/weekly/single_node/models/test_qwen3_30b_acc.py b/tests/e2e/weekly/single_node/models/test_qwen3_30b_acc.py index a1b09335..d193b010 100644 --- a/tests/e2e/weekly/single_node/models/test_qwen3_30b_acc.py +++ b/tests/e2e/weekly/single_node/models/test_qwen3_30b_acc.py @@ -45,7 +45,6 @@ mooncake_json = { "metadata_server": "P2PHANDSHAKE", "protocol": "ascend", "device_name": "", - "use_ascend_direct": True, "master_server_address": "", "global_segment_size": 30000000000 }