[Doc][P/D] Fix MooncakeConnector's name (#5172)
### What this PR does / why we need it?
The vLLM community has integrated its own MooncakeConnector upstream. As a
result, existing scripts now resolve to the upstream MooncakeConnector instead
of the one from vLLM-Ascend. All scripts that use the vLLM-Ascend
MooncakeConnector must be updated to the new name, MooncakeConnectorV1.
### Does this PR introduce _any_ user-facing change?
Yes, users must use the new name, MooncakeConnectorV1, to load the vLLM-Ascend MooncakeConnector.
### How was this patch tested?
By CI.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
This commit is contained in:
@@ -421,7 +421,7 @@ vllm serve /weights/DeepSeek-V3.1_w8a8mix_mtp \
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method": "mtp"}' \
|
||||
--additional-config '{"recompute_scheduler_enable":true,"enable_shared_expert_dp": true}' \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
@@ -500,7 +500,7 @@ vllm serve /weights/DeepSeek-V3.1_w8a8mix_mtp \
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method": "deepseek_mtp"}' \
|
||||
--additional-config '{"recompute_scheduler_enable":true,"enable_shared_expert_dp": true}' \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30100",
|
||||
"engine_id": "1",
|
||||
@@ -579,7 +579,7 @@ vllm serve /weights/DeepSeek-V3.1_w8a8mix_mtp \
|
||||
--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
|
||||
--additional-config '{"recompute_scheduler_enable":true,"multistream_overlap_shared_expert": true,"lm_head_tensor_parallel_size":16}' \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30200",
|
||||
"engine_id": "2",
|
||||
@@ -658,7 +658,7 @@ vllm serve /weights/DeepSeek-V3.1_w8a8mix_mtp \
|
||||
--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
|
||||
--additional-config '{"recompute_scheduler_enable":true,"multistream_overlap_shared_expert": true,"lm_head_tensor_parallel_size":16}' \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30300",
|
||||
"engine_id": "3",
|
||||
|
||||
@@ -294,7 +294,7 @@ Before you start, please
|
||||
--enforce-eager \
|
||||
--no-enable-prefix-caching \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
@@ -369,7 +369,7 @@ Before you start, please
|
||||
--enforce-eager \
|
||||
--no-enable-prefix-caching \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
@@ -447,7 +447,7 @@ Before you start, please
|
||||
--async-scheduling \
|
||||
--quantization ascend \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30100",
|
||||
"engine_id": "1",
|
||||
@@ -525,7 +525,7 @@ Before you start, please
|
||||
--no-enable-prefix-caching \
|
||||
--quantization ascend \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30100",
|
||||
"engine_id": "1",
|
||||
|
||||
@@ -504,7 +504,7 @@ vllm serve /path_to_weight/DeepSeek-r1_w8a8_mtp \
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
|
||||
--additional-config '{"recompute_scheduler_enable":true,"enable_shared_expert_dp": true}' \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
@@ -564,7 +564,7 @@ vllm serve /path_to_weight/DeepSeek-r1_w8a8_mtp \
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
|
||||
--additional-config '{"recompute_scheduler_enable":true,"enable_shared_expert_dp": true}' \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30100",
|
||||
"engine_id": "1",
|
||||
@@ -625,7 +625,7 @@ vllm serve /path_to_weight/DeepSeek-r1_w8a8_mtp \
|
||||
--additional-config '{"recompute_scheduler_enable":true,"multistream_overlap_shared_expert": true,"lm_head_tensor_parallel_size":16}' \
|
||||
--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30200",
|
||||
"engine_id": "2",
|
||||
@@ -685,7 +685,7 @@ vllm serve /path_to_weight/DeepSeek-r1_w8a8_mtp \
|
||||
--additional-config '{"recompute_scheduler_enable":true,"multistream_overlap_shared_expert": true,"lm_head_tensor_parallel_size":16}' \
|
||||
--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30200",
|
||||
"engine_id": "2",
|
||||
|
||||
@@ -155,7 +155,7 @@ vllm serve /model/Qwen2.5-VL-7B-Instruct \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
@@ -198,7 +198,7 @@ vllm serve /model/Qwen2.5-VL-7B-Instruct \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30100",
|
||||
"engine_id": "1",
|
||||
|
||||
@@ -158,7 +158,7 @@ vllm serve vllm-ascend/DeepSeek-R1-W8A8 \
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
|
||||
--enforce-eager \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_buffer_device": "npu",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_parallel_size": "1",
|
||||
@@ -225,7 +225,7 @@ vllm serve vllm-ascend/DeepSeek-R1-W8A8 \
|
||||
--quantization ascend \
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_buffer_device": "npu",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_parallel_size": "1",
|
||||
@@ -430,7 +430,7 @@ In the PD separation scenario, we provide a optimized configuration.
|
||||
|
||||
```shell
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_buffer_device": "npu",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_parallel_size": "1",
|
||||
@@ -453,7 +453,7 @@ In the PD separation scenario, we provide a optimized configuration.
|
||||
|
||||
```shell
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_buffer_device": "npu",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_parallel_size": "1",
|
||||
|
||||
@@ -50,7 +50,7 @@ def run_prefill(prefill_done, process_close):
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
|
||||
|
||||
ktc = KVTransferConfig(
|
||||
kv_connector="MooncakeConnector",
|
||||
kv_connector="MooncakeConnectorV1",
|
||||
kv_role="kv_producer",
|
||||
kv_port="30000",
|
||||
engine_id="0",
|
||||
@@ -102,7 +102,7 @@ def run_decode(prefill_done):
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95)
|
||||
|
||||
ktc = KVTransferConfig(
|
||||
kv_connector="MooncakeConnector",
|
||||
kv_connector="MooncakeConnectorV1",
|
||||
kv_role="kv_consumer",
|
||||
kv_port="30100",
|
||||
engine_id="1",
|
||||
|
||||
@@ -39,7 +39,7 @@ deployment:
|
||||
--gpu-memory-utilization 0.9
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
@@ -77,7 +77,7 @@ deployment:
|
||||
--gpu-memory-utilization 0.9
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30100",
|
||||
"engine_id": "1",
|
||||
@@ -116,7 +116,7 @@ deployment:
|
||||
--gpu-memory-utilization 0.9
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30200",
|
||||
"engine_id": "2",
|
||||
@@ -154,7 +154,7 @@ deployment:
|
||||
--gpu-memory-utilization 0.9
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30200",
|
||||
"engine_id": "2",
|
||||
|
||||
@@ -38,7 +38,7 @@ deployment:
|
||||
--gpu-memory-utilization 0.9
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
@@ -76,7 +76,7 @@ deployment:
|
||||
--gpu-memory-utilization 0.9
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30100",
|
||||
"engine_id": "1",
|
||||
@@ -115,7 +115,7 @@ deployment:
|
||||
--gpu-memory-utilization 0.9
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30200",
|
||||
"engine_id": "2",
|
||||
@@ -153,7 +153,7 @@ deployment:
|
||||
--gpu-memory-utilization 0.9
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30200",
|
||||
"engine_id": "2",
|
||||
|
||||
@@ -48,7 +48,7 @@ deployment:
|
||||
--no-enable-prefix-caching
|
||||
--gpu-memory-utilization 0.9
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
@@ -83,7 +83,7 @@ deployment:
|
||||
--no-enable-prefix-caching
|
||||
--gpu-memory-utilization 0.9
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30200",
|
||||
"engine_id": "1",
|
||||
|
||||
@@ -33,7 +33,7 @@ deployment:
|
||||
--no-enable-prefix-caching
|
||||
--gpu-memory-utilization 0.9
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
@@ -70,7 +70,7 @@ deployment:
|
||||
--no-enable-prefix-caching
|
||||
--gpu-memory-utilization 0.9
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30200",
|
||||
"engine_id": "1",
|
||||
|
||||
@@ -33,7 +33,7 @@ deployment:
|
||||
--no-enable-prefix-caching
|
||||
--gpu-memory-utilization 0.9
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "30000",
|
||||
"engine_id": "0",
|
||||
@@ -68,7 +68,7 @@ deployment:
|
||||
--no-enable-prefix-caching
|
||||
--gpu-memory-utilization 0.9
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
'{"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "30200",
|
||||
"engine_id": "1",
|
||||
|
||||
@@ -78,7 +78,7 @@ def create_vllm_config(
|
||||
enable_prefix_caching=True,
|
||||
)
|
||||
kv_transfer_config = KVTransferConfig(
|
||||
kv_connector="MooncakeConnector",
|
||||
kv_connector="MooncakeConnectorV1",
|
||||
kv_role="kv_both",
|
||||
kv_connector_module_path="vllm_ascend.distributed.mooncake_connector")
|
||||
return VllmConfig(scheduler_config=scheduler_config,
|
||||
|
||||
Reference in New Issue
Block a user