From 073a3a6e6cd632dbb4b642776b2719f472e31ccb Mon Sep 17 00:00:00 2001 From: zxr2333 <64738772+nwpu-zxr@users.noreply.github.com> Date: Thu, 18 Dec 2025 22:29:19 +0800 Subject: [PATCH] [Doc][P/D] Fix MooncakeConnector's name (#5172) ### What this PR does / why we need it? The vLLM community has integrated its own MooncakeConnector upstream. As a result, existing scripts now resolve the name `MooncakeConnector` to the upstream vLLM implementation instead of the one from vLLM-Ascend. All docs, examples, and test configs that use the vLLM-Ascend connector are therefore updated to its new name, `MooncakeConnectorV1`. ### Does this PR introduce _any_ user-facing change? Yes. Users must now set `kv_connector` to `MooncakeConnectorV1` to load the vLLM-Ascend MooncakeConnector. ### How was this patch tested? By CI. - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 --------- Signed-off-by: nwpu-zxr --- docs/source/tutorials/DeepSeek-V3.1.md | 8 ++++---- docs/source/tutorials/DeepSeek-V3.2.md | 8 ++++---- .../tutorials/pd_disaggregation_mooncake_multi_node.md | 8 ++++---- .../tutorials/pd_disaggregation_mooncake_single_node.md | 4 ++-- docs/source/user_guide/feature_guide/large_scale_ep.md | 8 ++++---- examples/offline_disaggregated_prefill_npu.py | 4 ++-- .../multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml | 8 ++++---- .../multi_node/config/models/DeepSeek-R1-W8A8.yaml | 8 ++++---- .../e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml | 4 ++-- .../multi_node/config/models/Qwen3-235B-W8A8-EPLB.yaml | 4 ++-- .../nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml | 4 ++-- tests/ut/kv_connector/utils.py | 2 +- 12 files changed, 35 insertions(+), 35 deletions(-) diff --git a/docs/source/tutorials/DeepSeek-V3.1.md b/docs/source/tutorials/DeepSeek-V3.1.md index ec0ee08d..058f809a 100644 --- a/docs/source/tutorials/DeepSeek-V3.1.md +++ b/docs/source/tutorials/DeepSeek-V3.1.md @@ -421,7 +421,7 @@ vllm serve /weights/DeepSeek-V3.1_w8a8mix_mtp \ --speculative-config '{"num_speculative_tokens": 1, "method": "mtp"}' \ --additional-config 
'{"recompute_scheduler_enable":true,"enable_shared_expert_dp": true}' \ --kv-transfer-config \ - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", "kv_port": "30000", "engine_id": "0", @@ -500,7 +500,7 @@ vllm serve /weights/DeepSeek-V3.1_w8a8mix_mtp \ --speculative-config '{"num_speculative_tokens": 1, "method": "deepseek_mtp"}' \ --additional-config '{"recompute_scheduler_enable":true,"enable_shared_expert_dp": true}' \ --kv-transfer-config \ - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", "kv_port": "30100", "engine_id": "1", @@ -579,7 +579,7 @@ vllm serve /weights/DeepSeek-V3.1_w8a8mix_mtp \ --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \ --additional-config '{"recompute_scheduler_enable":true,"multistream_overlap_shared_expert": true,"lm_head_tensor_parallel_size":16}' \ --kv-transfer-config \ - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", "kv_port": "30200", "engine_id": "2", @@ -658,7 +658,7 @@ vllm serve /weights/DeepSeek-V3.1_w8a8mix_mtp \ --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \ --additional-config '{"recompute_scheduler_enable":true,"multistream_overlap_shared_expert": true,"lm_head_tensor_parallel_size":16}' \ --kv-transfer-config \ - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", "kv_port": "30300", "engine_id": "3", diff --git a/docs/source/tutorials/DeepSeek-V3.2.md b/docs/source/tutorials/DeepSeek-V3.2.md index f7b547b7..61f529c6 100644 --- a/docs/source/tutorials/DeepSeek-V3.2.md +++ b/docs/source/tutorials/DeepSeek-V3.2.md @@ -294,7 +294,7 @@ Before you start, please --enforce-eager \ --no-enable-prefix-caching \ --kv-transfer-config \ - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", "kv_port": "30000", 
"engine_id": "0", @@ -369,7 +369,7 @@ Before you start, please --enforce-eager \ --no-enable-prefix-caching \ --kv-transfer-config \ - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", "kv_port": "30000", "engine_id": "0", @@ -447,7 +447,7 @@ Before you start, please --async-scheduling \ --quantization ascend \ --kv-transfer-config \ - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", "kv_port": "30100", "engine_id": "1", @@ -525,7 +525,7 @@ Before you start, please --no-enable-prefix-caching \ --quantization ascend \ --kv-transfer-config \ - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", "kv_port": "30100", "engine_id": "1", diff --git a/docs/source/tutorials/pd_disaggregation_mooncake_multi_node.md b/docs/source/tutorials/pd_disaggregation_mooncake_multi_node.md index 089c0820..54f4ff62 100644 --- a/docs/source/tutorials/pd_disaggregation_mooncake_multi_node.md +++ b/docs/source/tutorials/pd_disaggregation_mooncake_multi_node.md @@ -504,7 +504,7 @@ vllm serve /path_to_weight/DeepSeek-r1_w8a8_mtp \ --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \ --additional-config '{"recompute_scheduler_enable":true,"enable_shared_expert_dp": true}' \ --kv-transfer-config \ - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", "kv_port": "30000", "engine_id": "0", @@ -564,7 +564,7 @@ vllm serve /path_to_weight/DeepSeek-r1_w8a8_mtp \ --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \ --additional-config '{"recompute_scheduler_enable":true,"enable_shared_expert_dp": true}' \ --kv-transfer-config \ - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", "kv_port": "30100", "engine_id": "1", @@ -625,7 +625,7 @@ vllm serve 
/path_to_weight/DeepSeek-r1_w8a8_mtp \ --additional-config '{"recompute_scheduler_enable":true,"multistream_overlap_shared_expert": true,"lm_head_tensor_parallel_size":16}' \ --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \ --kv-transfer-config \ - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", "kv_port": "30200", "engine_id": "2", @@ -685,7 +685,7 @@ vllm serve /path_to_weight/DeepSeek-r1_w8a8_mtp \ --additional-config '{"recompute_scheduler_enable":true,"multistream_overlap_shared_expert": true,"lm_head_tensor_parallel_size":16}' \ --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \ --kv-transfer-config \ - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", "kv_port": "30200", "engine_id": "2", diff --git a/docs/source/tutorials/pd_disaggregation_mooncake_single_node.md b/docs/source/tutorials/pd_disaggregation_mooncake_single_node.md index 553fb7d0..44c266d1 100644 --- a/docs/source/tutorials/pd_disaggregation_mooncake_single_node.md +++ b/docs/source/tutorials/pd_disaggregation_mooncake_single_node.md @@ -155,7 +155,7 @@ vllm serve /model/Qwen2.5-VL-7B-Instruct \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ --kv-transfer-config \ - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", "kv_port": "30000", "engine_id": "0", @@ -198,7 +198,7 @@ vllm serve /model/Qwen2.5-VL-7B-Instruct \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ --kv-transfer-config \ - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", "kv_port": "30100", "engine_id": "1", diff --git a/docs/source/user_guide/feature_guide/large_scale_ep.md b/docs/source/user_guide/feature_guide/large_scale_ep.md index 1af17d1f..01a52434 100644 --- a/docs/source/user_guide/feature_guide/large_scale_ep.md +++ 
b/docs/source/user_guide/feature_guide/large_scale_ep.md @@ -158,7 +158,7 @@ vllm serve vllm-ascend/DeepSeek-R1-W8A8 \ --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \ --enforce-eager \ --kv-transfer-config \ - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_buffer_device": "npu", "kv_role": "kv_producer", "kv_parallel_size": "1", @@ -225,7 +225,7 @@ vllm serve vllm-ascend/DeepSeek-R1-W8A8 \ --quantization ascend \ --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \ --kv-transfer-config \ - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_buffer_device": "npu", "kv_role": "kv_consumer", "kv_parallel_size": "1", @@ -430,7 +430,7 @@ In the PD separation scenario, we provide a optimized configuration. ```shell --kv-transfer-config \ - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_buffer_device": "npu", "kv_role": "kv_producer", "kv_parallel_size": "1", @@ -453,7 +453,7 @@ In the PD separation scenario, we provide a optimized configuration. 
```shell --kv-transfer-config - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_buffer_device": "npu", "kv_role": "kv_consumer", "kv_parallel_size": "1", diff --git a/examples/offline_disaggregated_prefill_npu.py b/examples/offline_disaggregated_prefill_npu.py index 0e244cd3..96281d20 100644 --- a/examples/offline_disaggregated_prefill_npu.py +++ b/examples/offline_disaggregated_prefill_npu.py @@ -50,7 +50,7 @@ def run_prefill(prefill_done, process_close): sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) ktc = KVTransferConfig( - kv_connector="MooncakeConnector", + kv_connector="MooncakeConnectorV1", kv_role="kv_producer", kv_port="30000", engine_id="0", @@ -102,7 +102,7 @@ def run_decode(prefill_done): sampling_params = SamplingParams(temperature=0, top_p=0.95) ktc = KVTransferConfig( - kv_connector="MooncakeConnector", + kv_connector="MooncakeConnectorV1", kv_role="kv_consumer", kv_port="30100", engine_id="1", diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml index cbc48d64..f689113f 100644 --- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml +++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml @@ -39,7 +39,7 @@ deployment: --gpu-memory-utilization 0.9 --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}' --kv-transfer-config - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", "kv_port": "30000", "engine_id": "0", @@ -77,7 +77,7 @@ deployment: --gpu-memory-utilization 0.9 --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}' --kv-transfer-config - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", "kv_port": "30100", "engine_id": "1", @@ -116,7 +116,7 @@ deployment: --gpu-memory-utilization 0.9 
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}' --kv-transfer-config - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", "kv_port": "30200", "engine_id": "2", @@ -154,7 +154,7 @@ deployment: --gpu-memory-utilization 0.9 --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}' --kv-transfer-config - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", "kv_port": "30200", "engine_id": "2", diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml index a99239d6..e8050e54 100644 --- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml +++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml @@ -38,7 +38,7 @@ deployment: --gpu-memory-utilization 0.9 --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}' --kv-transfer-config - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", "kv_port": "30000", "engine_id": "0", @@ -76,7 +76,7 @@ deployment: --gpu-memory-utilization 0.9 --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}' --kv-transfer-config - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", "kv_port": "30100", "engine_id": "1", @@ -115,7 +115,7 @@ deployment: --gpu-memory-utilization 0.9 --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}' --kv-transfer-config - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", "kv_port": "30200", "engine_id": "2", @@ -153,7 +153,7 @@ deployment: --gpu-memory-utilization 0.9 --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}' --kv-transfer-config - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": 
"kv_consumer", "kv_port": "30200", "engine_id": "2", diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml index 09afa24c..ce336df3 100644 --- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml +++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml @@ -48,7 +48,7 @@ deployment: --no-enable-prefix-caching --gpu-memory-utilization 0.9 --kv-transfer-config - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", "kv_port": "30000", "engine_id": "0", @@ -83,7 +83,7 @@ deployment: --no-enable-prefix-caching --gpu-memory-utilization 0.9 --kv-transfer-config - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", "kv_port": "30200", "engine_id": "1", diff --git a/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8-EPLB.yaml b/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8-EPLB.yaml index 8e7bf334..339891d4 100644 --- a/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8-EPLB.yaml +++ b/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8-EPLB.yaml @@ -33,7 +33,7 @@ deployment: --no-enable-prefix-caching --gpu-memory-utilization 0.9 --kv-transfer-config - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", "kv_port": "30000", "engine_id": "0", @@ -70,7 +70,7 @@ deployment: --no-enable-prefix-caching --gpu-memory-utilization 0.9 --kv-transfer-config - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", "kv_port": "30200", "engine_id": "1", diff --git a/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml b/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml index be934f2e..9a1056b3 100644 --- a/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml +++ 
b/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml @@ -33,7 +33,7 @@ deployment: --no-enable-prefix-caching --gpu-memory-utilization 0.9 --kv-transfer-config - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", "kv_port": "30000", "engine_id": "0", @@ -68,7 +68,7 @@ deployment: --no-enable-prefix-caching --gpu-memory-utilization 0.9 --kv-transfer-config - '{"kv_connector": "MooncakeConnector", + '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", "kv_port": "30200", "engine_id": "1", diff --git a/tests/ut/kv_connector/utils.py b/tests/ut/kv_connector/utils.py index 70c84d80..0723eb01 100644 --- a/tests/ut/kv_connector/utils.py +++ b/tests/ut/kv_connector/utils.py @@ -78,7 +78,7 @@ def create_vllm_config( enable_prefix_caching=True, ) kv_transfer_config = KVTransferConfig( - kv_connector="MooncakeConnector", + kv_connector="MooncakeConnectorV1", kv_role="kv_both", kv_connector_module_path="vllm_ascend.distributed.mooncake_connector") return VllmConfig(scheduler_config=scheduler_config,