From 2ef4d1979e311656cb537b6b5557d94b035696a0 Mon Sep 17 00:00:00 2001 From: fems14 <74094523+fems14@users.noreply.github.com> Date: Sat, 27 Dec 2025 09:53:57 +0800 Subject: [PATCH] [bugfix][main]KV Pool for KV Transfer in PD Disaggregation Scenarios (#5398) ### What this PR does / why we need it? 1.KV Pool for KV Transfer in PD Disaggregation Scenarios Error Resolution 2.Update KV Pool Documentation ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: release/v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29 --------- Signed-off-by: fems14 <1804143737@qq.com> --- .../user_guide/feature_guide/kv_pool.md | 72 +++++++++---------- .../kvpool/ascend_store_connector.py | 2 +- .../distributed/kvpool/pool_scheduler.py | 7 +- 3 files changed, 40 insertions(+), 41 deletions(-) diff --git a/docs/source/user_guide/feature_guide/kv_pool.md b/docs/source/user_guide/feature_guide/kv_pool.md index 5f956f5e..2ba0ea9e 100644 --- a/docs/source/user_guide/feature_guide/kv_pool.md +++ b/docs/source/user_guide/feature_guide/kv_pool.md @@ -9,13 +9,20 @@ * vLLM:main branch * vLLM-Ascend:main branch -### KV Pooling Parameter Description +### KV Pool Parameter Description **kv_connector_extra_config**: Additional Configurable Parameters for Pooling. **lookup_rpc_port**: Port for RPC Communication Between Pooling Scheduler Process and Worker Process: Each Instance Requires a Unique Port Configuration. **load_async**: Whether to Enable Asynchronous Loading. The default value is false. **backend**: Set the storage backend for kvpool, with the default being mooncake. -## Example of using Mooncake as a KVCache pooling backend +### Environment Variable Configuration +To guarantee uniform hash generation, it is required to synchronize the PYTHONHASHSEED environment variable across all nodes upon enabling KV Pool. 
+ +```bash +export PYTHONHASHSEED=0 +``` + +## Example of using Mooncake as a KV Pool backend * Software: * Check NPU network configuration: @@ -83,39 +90,35 @@ The environment variable **MOONCAKE_CONFIG_PATH** is configured to the full path ``` { - "local_hostname": "xx.xx.xx.xx", "metadata_server": "P2PHANDSHAKE", "protocol": "ascend", "device_name": "", - "alloc_in_same_node": true, "master_server_address": "xx.xx.xx.xx:50088", "global_segment_size": "1GB" (1024MB/1048576KB/1073741824B/1073741824) } ``` -**local_hostname**: Configured as the IP address of the current master node. **metadata_server**: Configured as **P2PHANDSHAKE**. -**protocol:** Configured for Ascend to use Mooncake's HCCL communication. -**device_name**: "" -**alloc_in_same_node**: Indicator for preferring local buffer allocation strategy. +**protocol:** Must be set to "ascend" on the NPU. +**device_name**: "" **master_server_address**: Configured with the IP and port of the master service. -**global_segment_size**: Expands the kvcache size registered by the PD node to the master. +**global_segment_size**: Registered memory size per card to the KV Pool. -#### 2. Start mooncake_master +#### 2.Start mooncake_master Under the mooncake folder: ``` -mooncake_master --port 50088 --eviction_high_watermark_ratio 0.95 --eviction_ratio 0.05 +mooncake_master --port 50088 --eviction_high_watermark_ratio 0.9 --eviction_ratio 0.1 ``` `eviction_high_watermark_ratio` determines the watermark where Mooncake Store will perform eviction,and `eviction_ratio` determines the portion of stored objects that would be evicted. -### Pooling and Prefill Decode Disaggregate Scenario +### PD Disaggregation Scenario #### 1.Run `prefill` Node and `decode` Node -Using MultiConnector to simultaneously utilize both p2p connectors and pooled connectors. P2P performs kv_transfer, while pooling creates a larger prefix-cache. +Using `MultiConnector` to simultaneously utilize both `MooncakeConnectorV1` and `AscendStoreConnector`. 
`MooncakeConnectorV1` performs kv_transfer, while `AscendStoreConnector` serves as the prefix-cache node. `prefill` Node: @@ -127,6 +130,7 @@ The content of the multi_producer.sh script: ``` export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH +export PYTHONHASHSEED=0  export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json" export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 @@ -159,7 +163,6 @@ python3 -m vllm.entrypoints.openai.api_server \ "kv_connector": "MultiConnector", "kv_role": "kv_producer", "kv_connector_extra_config": { - "use_layerwise": false, "connectors": [ { "kv_connector": "MooncakeConnectorV1", @@ -177,12 +180,14 @@ python3 -m vllm.entrypoints.openai.api_server \ } } }, - { + { "kv_connector": "AscendStoreConnector", "kv_role": "kv_producer", - "lookup_rpc_port":"0", - "backend": "mooncake" - } + "kv_connector_extra_config": { + "lookup_rpc_port":"0", + "backend": "mooncake" + } + } ] } }' @@ -199,6 +204,7 @@ The content of multi_consumer.sh: ``` export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm +export PYTHONHASHSEED=0  export MOONCAKE_CONFIG_PATH="/xxxxx/mooncake.json" export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7 export ACL_OP_INIT_MODE=1 @@ -222,7 +228,6 @@ python3 -m vllm.entrypoints.openai.api_server \ "kv_connector": "MultiConnector", "kv_role": "kv_consumer", "kv_connector_extra_config": { - "use_layerwise": false, "connectors": [ { "kv_connector": "MooncakeConnectorV1", @@ -242,22 +247,17 @@ python3 -m vllm.entrypoints.openai.api_server \ { "kv_connector": "AscendStoreConnector", "kv_role": "kv_consumer", - "lookup_rpc_port":"1", - "backend": "mooncake" + "kv_connector_extra_config": { + "lookup_rpc_port":"1", + "backend": "mooncake" + } } ] } }' ``` -#### 2、Start proxy_server. - -``` -bash proxy.sh -``` - -proxy.sh content: -Change localhost to your actual IP address. 
+#### 2.Start proxy_server. ``` python vllm-ascend/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py \ @@ -268,7 +268,9 @@ python vllm-ascend/examples/disaggregated_prefill_v1/load_balance_proxy_server_e --decoder-ports 8200 \ ``` -#### 3. Run Inference +Change localhost to your actual IP address. + +#### 3.Run Inference Configure the localhost, port, and model weight path in the command to your own settings. @@ -284,11 +286,9 @@ Long question: curl -s http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Given the accelerating impacts of climate change—including rising sea levels, increasing frequency of extreme weather events, loss of biodiversity, and adverse effects on agriculture and human health—there is an urgent need for a robust, globally coordinated response. However, international efforts are complicated by a range of factors: economic disparities between high-income and low-income countries, differing levels of industrialization, varying access to clean energy technologies, and divergent political systems that influence climate policy implementation. In this context, how can global agreements like the Paris Accord be redesigned or strengthened to not only encourage but effectively enforce emission reduction targets? Furthermore, what mechanisms can be introduced to promote fair and transparent technology transfer, provide adequate financial support for climate adaptation in vulnerable regions, and hold nations accountable without exacerbating existing geopolitical tensions or disproportionately burdening those with historically lower emissions?", "max_tokens": 256, "temperature":0.0 }' ``` -### Pooling and Mixed Deployment Scenario +### Colocation Scenario -#### 1、Run Mixed Department Script - -The mixed script is essentially a pure pooling scenario for the P node. 
+#### 1.Run Mixed Deployment Script ``` bash mixed_department.sh @@ -301,6 +301,7 @@ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packa export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json" export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 +export PYTHONHASHSEED=0  export ACL_OP_INIT_MODE=1 export ASCEND_BUFFER_POOL=4:8 export ASCEND_CONNECT_TIMEOUT=10000 @@ -322,14 +323,13 @@ python3 -m vllm.entrypoints.openai.api_server \ "kv_connector": "AscendStoreConnector", "kv_role": "kv_both", "kv_connector_extra_config": { - "use_layerwise": false, "lookup_rpc_port":"1", "backend": "mooncake" } }' > mix.log 2>&1 ``` -#### 2. Run Inference +#### 2.Run Inference Configure the localhost, port, and model weight path in the command to your own settings. The requests sent will only go to the port where the mixed deployment script is located, and there is no need to start a separate proxy. diff --git a/vllm_ascend/distributed/kvpool/ascend_store_connector.py b/vllm_ascend/distributed/kvpool/ascend_store_connector.py index 1f11f841..093f3c07 100644 --- a/vllm_ascend/distributed/kvpool/ascend_store_connector.py +++ b/vllm_ascend/distributed/kvpool/ascend_store_connector.py @@ -55,7 +55,7 @@ class AscendStoreConnector(KVConnectorBase_V1): ) assert self.connector_worker is not None - if vllm_config.parallel_config.rank == 0 and self.kv_role != "kv_consumer": + if vllm_config.parallel_config.rank == 0: self.lookup_server = LookupKeyServer(self.connector_worker, vllm_config, self.use_layerwise) diff --git a/vllm_ascend/distributed/kvpool/pool_scheduler.py b/vllm_ascend/distributed/kvpool/pool_scheduler.py index 35ff9f19..9e1d982a 100644 --- a/vllm_ascend/distributed/kvpool/pool_scheduler.py +++ b/vllm_ascend/distributed/kvpool/pool_scheduler.py @@ -26,8 +26,7 @@ class KVPoolScheduler: "consumer_is_to_load", False) self.load_async = vllm_config.kv_transfer_config.kv_connector_extra_config.get( "load_async", False) - 
self.client = LookupKeyClient( - vllm_config) if self.kv_role != "kv_consumer" else None + self.client = LookupKeyClient(vllm_config) # request_id -> (vllm cached tokes, kvpool cached tokens) self.load_specs: dict[str, LoadSpec] = {} self.pcp_size = getattr(vllm_config.parallel_config, @@ -75,8 +74,8 @@ class KVPoolScheduler: else: token_len = len(request.prompt_token_ids) - num_external_hit_tokens = self.client.lookup( # type: ignore[union-attr] - token_len, request.block_hashes) + num_external_hit_tokens = self.client.lookup(token_len, + request.block_hashes) if num_external_hit_tokens == request.num_tokens: num_external_hit_tokens -= 1