import os
import queue
import socket
import sys
import threading
import time
import types
import unittest
from collections import defaultdict, deque
from typing import Any, Dict, OrderedDict, Optional
from unittest.mock import MagicMock, patch

import msgspec
import zmq

from vllm.utils.network_utils import make_zmq_path

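# The real Mooncake TransferEngine is not importable in the unit-test
# environment, so a fake "mooncake.engine" module backed by MagicMock is
# registered before the connector module is imported.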
fake_engine = types.ModuleType("mooncake.engine")
fake_engine.TransferEngine = MagicMock()  # type: ignore[attr-defined]
sys.modules["mooncake.engine"] = fake_engine

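# Module-level stand-ins: an ascend-config stub with enable_kv_nz disabled and
# MagicMock parallel groups (tensor parallel world size 4; pipeline,
# prefill-context and decode-context parallel groups of size 1).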
_mock_ascend_config = MagicMock(enable_kv_nz=False)
_mock_pp_group = MagicMock(rank_in_group=0, world_size=1)
_mock_tp_group = MagicMock(rank_in_group=0, world_size=4)
_mock_pcp_group = MagicMock(rank_in_group=0, world_size=1)
_mock_dcp_group = MagicMock(rank_in_group=0, world_size=1)

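# The connector module looks up its parallel groups through these accessors;
# patch them (and the _DCP group) before importing it so no real distributed
# initialization is required. The patches stay active for the whole module.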
patch(
    'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_pp_group',
    return_value=_mock_pp_group).start()
patch(
    'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_tp_group',
    return_value=_mock_tp_group).start()
patch(
    'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_tensor_model_parallel_world_size',
    return_value=4).start()
patch(
    'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_tensor_model_parallel_rank',
    return_value=0).start()
patch(
    'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_pcp_group',
    return_value=_mock_pcp_group).start()
patch('vllm.distributed.parallel_state._DCP', _mock_dcp_group).start()

from vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector import (  # noqa: E402
    KVCacheRecvingThread, KVCacheSendingThread, KVCacheTaskTracker,
    KVConnectorRole, MooncakeAgentMetadata, MooncakeConnector,
    MooncakeConnectorMetadata, MooncakeConnectorScheduler,
    MooncakeConnectorWorker, ReqMeta, ensure_zmq_recv, ensure_zmq_send,
    group_concurrent_contiguous, string_to_int64_hash, zmq_ctx)

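# Side-channel message tags exchanged with KVCacheSendingThread's ZMQ server;
# TestKVCacheSendingThread below sends these over a DEALER socket.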
GET_META_MSG = b"get_meta_msg"
DONE_RECVING_MSG = b"done_recving_msg"


class TestKVCacheTaskTrackerInit(unittest.TestCase):

    def test_init_basic_properties(self):
        tracker = KVCacheTaskTracker()
        self.assertIsInstance(tracker.done_task_lock, type(threading.Lock()))
        self.assertIsInstance(tracker.finished_requests, set)
        self.assertIsInstance(tracker.delayed_free_requests, OrderedDict)


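# KVCacheTaskTracker.get_and_clear_finished_requests() should hand out the
# finished set exactly once and leave the tracker empty, even with
# concurrent callers.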
class TestGetAndClearFinishedSingleRequests(unittest.TestCase):

    def setUp(self):
        self.tracker = KVCacheTaskTracker()
        self.tracker.finished_requests = set()
        self.tracker.done_task_lock = threading.Lock()

    def test_empty_requests(self):
        result = self.tracker.get_and_clear_finished_requests()
        self.assertEqual(result, set())
        self.assertEqual(len(self.tracker.finished_requests), 0)

    def test_single_request(self):
        self.tracker.finished_requests = {"req_123"}
        result = self.tracker.get_and_clear_finished_requests()
        self.assertEqual(result, {"req_123"})
        self.assertEqual(len(self.tracker.finished_requests), 0)

    def test_multiple_requests(self):
        self.tracker.finished_requests = {"req_1", "req_2", "req_3"}
        result = self.tracker.get_and_clear_finished_requests()
        self.assertSetEqual(result, {"req_1", "req_2", "req_3"})
        self.assertEqual(len(self.tracker.finished_requests), 0)

    @patch(
        "vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.logger")
    def test_concurrent_access(self, mock_logger):
        from concurrent.futures import ThreadPoolExecutor
        self.tracker.finished_requests = {"req_1", "req_2"}
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [
                executor.submit(self.tracker.get_and_clear_finished_requests)
                for _ in range(3)
            ]
            results = [f.result() for f in futures]
        self.assertEqual(sum(1 for r in results if r), 1)
        self.assertEqual(len(self.tracker.finished_requests), 0)


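# Constructor-level checks for KVCacheSendingThread; MockVllmConfig is assumed
# to be the lightweight vLLM config stub defined elsewhere in this test module.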
class TestKVCacheSendingThreadInit(unittest.TestCase):

    def setUp(self):
        kv_caches: Dict[str, Any] = {}
        self.common_args = {
            'tp_rank': 1,
            'prefill_tp_size': 4,
            'local_engine_id': 'engine_1',
            'side_channel_host': 'localhost',
            'side_channel_port': 5555,
            'metadata': MagicMock(),
            'vllm_config': MockVllmConfig(),
            'ready_event': threading.Event(),
            'kv_caches': kv_caches,
            'pcp_rank': 0
        }
        self.threads = []

    def tearDown(self):
        for thread in self.threads:
            if hasattr(thread, 'task_tracker') and hasattr(
                    thread.task_tracker, 'socket'):
                thread.task_tracker.socket.close()
            if hasattr(thread, 'is_alive') and thread.is_alive():
                thread.join(timeout=0.1)

    def test_thread_daemon_property(self):
        thread = KVCacheSendingThread(**self.common_args)
        self.threads.append(thread)
        self.assertTrue(thread.daemon)

    def test_thread_name_format(self):
        thread = KVCacheSendingThread(**self.common_args)
        self.threads.append(thread)
        self.assertEqual(thread.name, "KVCacheSendingThread")

    def test_ready_event_reference(self):
        custom_event = threading.Event()
        args = self.common_args.copy()
        args['ready_event'] = custom_event
        thread = KVCacheSendingThread(**args)
        self.threads.append(thread)
        self.assertIs(thread.ready_event, custom_event)


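# The sending thread should delegate finished-request collection straight to
# its KVCacheTaskTracker.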
class TestGetAndClearFinishedRequests(unittest.TestCase):

    def setUp(self):
        kv_caches: Dict[str, Any] = {}
        self.common_args = {
            'tp_rank': 1,
            'prefill_tp_size': 4,
            'local_engine_id': 'engine_1',
            'side_channel_host': 'localhost',
            'vllm_config': MockVllmConfig(),
            'side_channel_port': 5555,
            'metadata': {
                "test": "metadata"
            },
            'ready_event': threading.Event(),
            'kv_caches': kv_caches,
            'pcp_rank': 0
        }
        self.thread = KVCacheSendingThread(**self.common_args)

    @patch.object(KVCacheTaskTracker, 'get_and_clear_finished_requests')
    def test_get_and_clear_finished_requests(self, mock_get_clear):
        expected_requests = {'req1', 'req2'}
        mock_get_clear.return_value = expected_requests
        result = self.thread.get_and_clear_finished_requests()
        mock_get_clear.assert_called_once()
        self.assertEqual(result, expected_requests)


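# End-to-end exercise of the sending thread's ZMQ side channel: a DEALER
# socket fetches the agent metadata, then reports a finished request and
# expects an ACK. The listening port is offset from base_port by
# pp_rank * tp_size + tp_rank + pcp_rank * prefill_tp_size; with the rank-0
# setup mocked above this offset should be 0, so actual_port should equal
# base_port.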
class TestKVCacheSendingThread(unittest.TestCase):

    def test_run_handles_get_meta_and_done_recv_msgs(self):
        ready_event = threading.Event()
        metadata = MooncakeAgentMetadata(
            engine_id="engine1",
            te_rpc_port=9090,
            kv_caches_base_addr=[12345678],
            num_blocks=2,
        )
        vllm_config = MockVllmConfig()
        host = "127.0.0.1"
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(('', 0))
            base_port = s.getsockname()[1]

        thread = KVCacheSendingThread(tp_rank=0,
                                      prefill_tp_size=1,
                                      local_engine_id="engine1",
                                      side_channel_host=host,
                                      side_channel_port=base_port,
                                      metadata=metadata,
                                      vllm_config=vllm_config,
                                      ready_event=ready_event,
                                      kv_caches={},
                                      pcp_rank=0)
        thread.start()
        actual_port = base_port + (thread.pp_rank * thread.tp_size +
                                   thread.tp_rank +
                                   thread.pcp_rank * thread.prefill_tp_size)
        self.assertTrue(ready_event.wait(timeout=3),
                        "Server thread startup timeout")

        context = zmq.Context()  # type: ignore
        sock = context.socket(zmq.DEALER)  # type: ignore
        sock.connect(f"tcp://{host}:{actual_port}")
        encoder = msgspec.msgpack.Encoder()
        decoder = msgspec.msgpack.Decoder(type=MooncakeAgentMetadata)

        sock.send_multipart([b"", encoder.encode((GET_META_MSG, ))])
        frames = sock.recv_multipart()
        self.assertEqual(frames[0], b"")
        meta = decoder.decode(frames[1])
        self.assertEqual(meta.engine_id, "engine1")
        self.assertEqual(meta.kv_caches_base_addr, [12345678])
        self.assertEqual(meta.num_blocks, 2)

        req_id = "request_42"
        sock.send_multipart(
            [b"", encoder.encode((DONE_RECVING_MSG, req_id, 0))])
        frames = sock.recv_multipart()
        self.assertEqual(frames[0], b"")
        self.assertEqual(frames[1], b"ACK")
        self.assertIn(req_id, thread.task_tracker.finished_requests)

        sock.close()
        context.term()


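# KVCacheRecvingThread is built against a mocked transfer engine and config,
# so these tests only cover request queuing and finished-request bookkeeping.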
class TestKVCacheRecvingThreadBasic(unittest.TestCase):

    def setUp(self):
        self.engine = MagicMock()
        self.ready_event = threading.Event()
        self.vllm_config = MockVllmConfig()
        self.kv_caches: Dict[str, Any] = {}
        self.thread = KVCacheRecvingThread(
            tp_rank=0,
            tp_size=4,
            _prefill_pp_size=1,
            engine=self.engine,
            local_engine_id="local_engine",
            local_handshake_port=5555,
            side_channel_port=30000,
            local_kv_caches_base_addr=[0x1000, 0x2000],
            block_len=[1024, 2048],
            ready_event=self.ready_event,
            vllm_config=self.vllm_config,
            kv_caches=self.kv_caches,
            prefill_pp_layer_partition=None)

    def test_add_request(self):
        test_req = {
            "request_id": "req1",
            "local_block_ids": [1, 2],
            "remote_block_ids": [3, 4],
            "remote_engine_id": "remote_engine",
            "remote_host": "localhost",
            "remote_handshake_port": 6666,
            "offset": 0,
            "tp_num_need_pulls": 2,
            "all_task_done": False
        }
        self.thread.add_request(
            request_id=test_req["request_id"],
            local_block_ids=test_req["local_block_ids"],
            remote_block_ids=test_req["remote_block_ids"],
            remote_engine_id=test_req["remote_engine_id"],
            remote_host=test_req["remote_host"],
            remote_handshake_port=test_req["remote_handshake_port"],
            offset=test_req["offset"],
            tp_num_need_pulls=test_req["tp_num_need_pulls"],
            all_task_done=test_req["all_task_done"])
        queued = self.thread.request_queue.get_nowait()
        self.assertEqual(queued["request_id"], "req1")
        self.assertEqual(queued["remote_host"], "localhost")

    @patch.object(KVCacheTaskTracker, 'get_and_clear_finished_requests')
    def test_get_finished_requests(self, mock_tracker):
        mock_tracker.return_value = {"req1", "req2"}
        result = self.thread.get_and_clear_finished_requests()
        self.assertEqual(result, {"req1", "req2"})


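# _get_remote_socket() is expected to create a REQ socket for
# tcp://<host>:<port> and register it with the poller; _return_remote_socket()
# should put it back into the per-path pool without registering it again.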
class TestSocketManagement(unittest.TestCase):

    def setUp(self):
        self.engine = MagicMock()
        self.ready_event = threading.Event()
        self.vllm_config = MockVllmConfig()
        self.kv_caches: Dict[str, Any] = {}
        self.thread = KVCacheRecvingThread(
            tp_rank=0,
            tp_size=4,
            _prefill_pp_size=1,
            engine=self.engine,
            local_engine_id="local_engine",
            local_handshake_port=5555,
            side_channel_port=30000,
            local_kv_caches_base_addr=[0x1000, 0x2000],
            block_len=[1024, 2048],
            ready_event=self.ready_event,
            vllm_config=self.vllm_config,
            kv_caches=self.kv_caches,
            prefill_pp_layer_partition=None)
        self.thread.remote_sockets = defaultdict(deque)
        self.thread.remote_poller = MagicMock()

    @patch(
        'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.zmq.Context'
    )
    @patch(
        'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.make_zmq_socket'
    )
    def test_get_remote_socket(self, mock_make_socket, mock_context):
        mock_sock = MagicMock()
        mock_make_socket.return_value = mock_sock
        test_host = "test_host"
        test_port = 12345

        sock = self.thread._get_remote_socket(test_host, test_port)

        self.assertEqual(sock, mock_sock)
        mock_make_socket.assert_called_once()
        args, kwargs = mock_make_socket.call_args
        self.assertEqual(kwargs.get('path'), 'tcp://test_host:12345')
        self.assertEqual(kwargs.get('socket_type'), zmq.REQ)  # type: ignore
        self.assertFalse(kwargs.get('bind', True))
        self.thread.remote_poller.register.assert_called_with(
            mock_sock, zmq.POLLIN)  # type: ignore

    def test_return_socket_to_pool(self):
        mock_sock = MagicMock()
        test_host = "test_host"
        test_port = 12345
        test_path = make_zmq_path("tcp", test_host, test_port)

        self.thread._return_remote_socket(mock_sock, test_host, test_port)

        self.assertEqual(len(self.thread.remote_sockets[test_path]), 1)
        self.assertEqual(self.thread.remote_sockets[test_path][0], mock_sock)
        self.thread.remote_poller.register.assert_not_called()


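# _handle_request() should transfer the requested KV cache blocks and then
# send the done-receiving signal; both helpers are patched so only the
# dispatch path is exercised here.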
class TestCoreFunctionality(unittest.TestCase):

    def setUp(self):
        self.engine = MagicMock()
        self.ready_event = threading.Event()
        self.mock_queue = MagicMock()
        self.vllm_config = MockVllmConfig()
        self.kv_caches: Dict[str, Any] = {
            "layer_0": (MagicMock(), MagicMock())
        }
        self.thread = KVCacheRecvingThread(
            tp_rank=0,
            tp_size=4,
            _prefill_pp_size=1,
            engine=self.engine,
            local_engine_id="local_engine",
            local_handshake_port=5555,
            side_channel_port=30000,
            local_kv_caches_base_addr=[0x1000, 0x2000],
            block_len=[1024, 2048],
            ready_event=self.ready_event,
            vllm_config=self.vllm_config,
            kv_caches=self.kv_caches,
            prefill_pp_layer_partition=None)
        self.thread.request_queue = self.mock_queue
        self.test_req = {
            "request_id": "req1",
            "local_block_ids": [1, 2],
            "remote_block_ids": [3, 4],
            "remote_engine_id": "remote_engine",
            "remote_host": "localhost",
            "remote_handshake_port": 6666,
            "remote_transfer_port": 7777,
            "offset": 0,
            "tp_num_need_pulls": 2,
            "remote_port_send_num": {
                6666: 1
            },
            "all_task_done": False
        }
        self.thread.task_tracker = MagicMock()
        self.engine.batch_transfer_sync_read.return_value = 0
        self.thread.remote_te_port = {"remote_engine": {6666: 7777}}

    @patch.object(KVCacheRecvingThread, '_transfer_kv_cache')
    @patch.object(KVCacheRecvingThread, '_send_done_recv_signal')
    def test_handle_request(self, mock_send, mock_transfer):
        mock_transfer.return_value = None
        mock_send.return_value = None

        self.thread._handle_request(self.test_req)

        mock_transfer.assert_called_once_with(self.test_req)
2025-12-31 09:53:03 +08:00
|
|
|
mock_send.assert_called_once_with("req1", "localhost", 6666, {6666: 1})
|
2025-10-20 16:30:57 +08:00
|
|
|
# NOTE: this guard keeps the check below from depending on whether
# _handle_request updates the tracker itself; if it does not, the update is
# triggered here so the single-call assertion still holds.
if not self.thread.task_tracker.update_done_task_count.called:
|
|
|
|
|
self.thread.task_tracker.update_done_task_count("req1")
|
2025-08-18 14:30:07 +08:00
|
|
|
self.thread.task_tracker.update_done_task_count.assert_called_once_with(
|
2025-09-04 08:22:10 +08:00
|
|
|
"req1")
|
2025-08-18 14:30:07 +08:00
|
|
|
self.mock_queue.task_done.assert_called_once()
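# Illustrative sketch only (not the connector's actual API): the assertions
# above exercise the "remote_port_send_num" bookkeeping, where a prefill port
# is only told to free its KV blocks after the last expected pull has been
# acknowledged.  The hypothetical helper below models just that counting logic.

def record_pull(remaining: dict, port: int) -> bool:
    """Decrement the expected-pull count for `port`; True means it was the last pull."""
    remaining[port] -= 1
    if remaining[port] == 0:
        del remaining[port]
        return True
    return False

# Example mirroring self.test_req["remote_port_send_num"] above.
_remaining = {6666: 1}
assert record_pull(_remaining, 6666) is True  # last pull -> safe to free blocks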
|
|
|
|
|
|
|
|
|
|
@patch.object(KVCacheRecvingThread, '_get_remote_metadata')
|
|
|
|
|
def test_transfer_kv_cache(self, mock_get_meta):
|
2025-12-10 16:01:43 +08:00
|
|
|
with patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_ascend_config'
|
2025-12-10 16:01:43 +08:00
|
|
|
) as mock_config:
|
|
|
|
|
mock_config.return_value.enable_kv_nz = False
|
|
|
|
|
self.thread.kv_caches_base_addr["remote_engine"] = {
|
|
|
|
|
6666: [0x3000, 0x4000]
|
|
|
|
|
}
|
|
|
|
|
self.thread._transfer_kv_cache(self.test_req)
|
2025-08-18 14:30:07 +08:00
|
|
|
self.engine.batch_transfer_sync_read.assert_called_once()
|
|
|
|
|
call_args, call_kwargs = self.engine.batch_transfer_sync_read.call_args
|
|
|
|
|
self.assertEqual(call_args[0], "localhost:7777")
|
|
|
|
|
self.assertIsInstance(call_args[1], list)
|
|
|
|
|
self.assertIsInstance(call_args[2], list)
|
|
|
|
|
self.assertIsInstance(call_args[3], list)
|
|
|
|
|
self.assertEqual(len(call_args[1]), len(call_args[2]))
|
|
|
|
|
self.assertEqual(len(call_args[1]), len(call_args[3]))
|
|
|
|
|
mock_get_meta.assert_not_called()
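# Illustrative sketch (hypothetical helper, not the connector implementation):
# the assertions above only check the *shape* of the batch_transfer_sync_read
# call -- a "host:port" target plus three equal-length lists of source
# addresses, destination addresses and byte lengths.  One plausible way such
# lists could be derived from block ids and a per-layer base address:

def build_transfer_lists(local_base, remote_base, local_blocks, remote_blocks, block_len):
    """Return (src_addrs, dst_addrs, lengths) for one KV-cache layer."""
    src, dst, length = [], [], []
    for lb, rb in zip(local_blocks, remote_blocks):
        src.append(local_base + lb * block_len)
        dst.append(remote_base + rb * block_len)
        length.append(block_len)
    return src, dst, length

# Mirrors the fixture: local blocks [1, 2] pulled from remote blocks [3, 4].
src, dst, length = build_transfer_lists(0x1000, 0x3000, [1, 2], [3, 4], 1024)
assert len(src) == len(dst) == len(length) == 2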
|
|
|
|
|
|
|
|
|
|
def test_transfer_kv_cache_failure(self):
|
|
|
|
|
self.engine.batch_transfer_sync_read.return_value = -1
|
|
|
|
|
self.thread.kv_caches_base_addr["remote_engine"] = {
|
|
|
|
|
6666: [0x3000, 0x4000]
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
with self.assertRaises(RuntimeError):
|
|
|
|
|
self.thread._transfer_kv_cache(self.test_req)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestMetadataHandling(unittest.TestCase):
|
|
|
|
|
|
|
|
|
|
def setUp(self):
|
|
|
|
|
self.engine = MagicMock()
|
|
|
|
|
self.ready_event = threading.Event()
|
2025-10-20 16:30:57 +08:00
|
|
|
self.vllm_config = MockVllmConfig()
|
|
|
|
|
self.kv_caches: Dict[str, Any] = {}
|
2025-08-18 14:30:07 +08:00
|
|
|
self.thread = KVCacheRecvingThread(
|
|
|
|
|
tp_rank=0,
|
|
|
|
|
tp_size=4,
|
2025-12-10 16:01:43 +08:00
|
|
|
_prefill_pp_size=1,
|
2025-08-18 14:30:07 +08:00
|
|
|
engine=self.engine,
|
|
|
|
|
local_engine_id="local_engine",
|
|
|
|
|
local_handshake_port=5555,
|
2025-12-31 09:53:03 +08:00
|
|
|
side_channel_port=30000,
|
2025-08-18 14:30:07 +08:00
|
|
|
local_kv_caches_base_addr=[0x1000, 0x2000],
|
|
|
|
|
block_len=[1024, 2048],
|
2025-10-20 16:30:57 +08:00
|
|
|
ready_event=self.ready_event,
|
|
|
|
|
vllm_config=self.vllm_config,
|
2026-01-04 16:49:33 +08:00
|
|
|
kv_caches=self.kv_caches,
|
|
|
|
|
prefill_pp_layer_partition=None)
|
2025-08-18 14:30:07 +08:00
|
|
|
self.test_metadata = MooncakeAgentMetadata(
|
|
|
|
|
engine_id="remote_engine",
|
|
|
|
|
te_rpc_port=9090,
|
|
|
|
|
kv_caches_base_addr=[0x3000, 0x4000],
|
|
|
|
|
num_blocks=2)
|
|
|
|
|
|
2026-01-15 08:57:40 +08:00
|
|
|
@patch(
|
|
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.ensure_zmq_send'
|
|
|
|
|
)
|
|
|
|
|
@patch(
|
|
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.ensure_zmq_recv'
|
|
|
|
|
)
|
2025-08-18 14:30:07 +08:00
|
|
|
def test_get_remote_metadata_success(self, mock_recv, mock_send):
|
|
|
|
|
mock_recv.return_value = msgspec.msgpack.encode(self.test_metadata)
|
|
|
|
|
|
|
|
|
|
with patch.object(self.thread, '_get_remote_socket') as mock_get_socket, \
|
|
|
|
|
patch.object(self.thread, '_return_remote_socket') as mock_return_socket:
|
|
|
|
|
mock_socket = MagicMock()
|
|
|
|
|
mock_get_socket.return_value = mock_socket
|
|
|
|
|
|
|
|
|
|
self.thread._get_remote_metadata("host1", 5555)
|
|
|
|
|
|
|
|
|
|
mock_get_socket.assert_called_once_with("host1", 5555)
|
|
|
|
|
mock_return_socket.assert_called_once_with(mock_socket, "host1",
|
|
|
|
|
5555)
|
|
|
|
|
mock_send.assert_called_once_with(
|
|
|
|
|
mock_socket, self.thread.encoder.encode((GET_META_MSG, "")))
|
|
|
|
|
mock_recv.assert_called_once_with(mock_socket,
|
|
|
|
|
self.thread.remote_poller)
|
|
|
|
|
self.assertEqual(
|
|
|
|
|
self.thread.kv_caches_base_addr["remote_engine"][5555],
|
|
|
|
|
[0x3000, 0x4000])
|
|
|
|
|
|
2026-01-15 08:57:40 +08:00
|
|
|
@patch(
|
|
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.ensure_zmq_send'
|
|
|
|
|
)
|
|
|
|
|
@patch(
|
|
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.ensure_zmq_recv',
|
|
|
|
|
side_effect=Exception("Network error"))
|
2025-08-18 14:30:07 +08:00
|
|
|
def test_get_remote_metadata_failure(self, mock_recv, mock_send):
|
|
|
|
|
with patch.object(self.thread, '_get_remote_socket') as mock_get_socket, \
|
|
|
|
|
patch.object(self.thread, '_return_remote_socket') as mock_return_socket:
|
|
|
|
|
mock_socket = MagicMock()
|
|
|
|
|
mock_get_socket.return_value = mock_socket
|
|
|
|
|
|
|
|
|
|
with self.assertRaises(Exception) as context:
|
|
|
|
|
self.thread._get_remote_metadata("host1", 5555)
|
|
|
|
|
|
|
|
|
|
self.assertEqual(str(context.exception), "Network error")
|
|
|
|
|
mock_return_socket.assert_called_once()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestMainThreadLoop(unittest.TestCase):
|
|
|
|
|
|
|
|
|
|
def setUp(self):
|
|
|
|
|
self.engine = MagicMock()
|
|
|
|
|
self.ready_event = threading.Event()
|
2025-10-20 16:30:57 +08:00
|
|
|
self.vllm_config = MockVllmConfig()
|
|
|
|
|
self.kv_caches: Dict[str, Any] = {}
|
2025-08-18 14:30:07 +08:00
|
|
|
self.thread = KVCacheRecvingThread(
|
|
|
|
|
tp_rank=0,
|
|
|
|
|
tp_size=4,
|
2025-12-10 16:01:43 +08:00
|
|
|
_prefill_pp_size=1,
|
2025-08-18 14:30:07 +08:00
|
|
|
engine=self.engine,
|
|
|
|
|
local_engine_id="local_engine",
|
|
|
|
|
local_handshake_port=5555,
|
2025-12-31 09:53:03 +08:00
|
|
|
side_channel_port=30000,
|
2025-08-18 14:30:07 +08:00
|
|
|
local_kv_caches_base_addr=[0x1000, 0x2000],
|
|
|
|
|
block_len=[1024, 2048],
|
2025-10-20 16:30:57 +08:00
|
|
|
ready_event=self.ready_event,
|
|
|
|
|
vllm_config=self.vllm_config,
|
2026-01-04 16:49:33 +08:00
|
|
|
kv_caches=self.kv_caches,
|
|
|
|
|
prefill_pp_layer_partition=None)
|
2025-08-18 14:30:07 +08:00
|
|
|
self.thread.request_queue = queue.Queue()
|
|
|
|
|
|
|
|
|
|
@patch.object(KVCacheRecvingThread, '_handle_request')
|
|
|
|
|
def test_run_loop_normal(self, mock_handle):
|
|
|
|
|
test_request = {
|
|
|
|
|
"request_id": "req1",
|
|
|
|
|
"local_block_ids": [1, 2],
|
|
|
|
|
"remote_block_ids": [3, 4],
|
|
|
|
|
"remote_engine_id": "remote_engine",
|
|
|
|
|
"remote_host": "localhost",
|
|
|
|
|
"remote_handshake_port": 6666,
|
2025-10-20 16:30:57 +08:00
|
|
|
"remote_transfer_port": 7777,
|
|
|
|
|
"offset": 0,
|
2025-12-10 16:01:43 +08:00
|
|
|
"tp_num_need_pulls": 2,
|
2025-11-21 18:08:56 +08:00
|
|
|
"all_task_done": False
|
2025-08-18 14:30:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
self.thread.request_queue.put(test_request)
|
|
|
|
|
self.thread.request_queue.put(None)
|
|
|
|
|
|
|
|
|
|
self.thread.start()
|
|
|
|
|
time.sleep(0.1)
|
|
|
|
|
self.thread.join(timeout=1.0)
|
|
|
|
|
|
|
|
|
|
self.assertTrue(self.thread.ready_event.is_set())
|
|
|
|
|
mock_handle.assert_called_once_with(test_request)
|
|
|
|
|
self.assertTrue(self.thread.request_queue.empty())
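# Illustrative sketch of the loop shape this test exercises (a minimal model,
# not KVCacheRecvingThread itself): the thread signals readiness, then drains
# a queue.Queue until it sees a None sentinel, marking every item done.

import queue
import threading

def run_busy_loop(requests: "queue.Queue", ready: threading.Event, handle) -> None:
    ready.set()
    while True:
        req = requests.get()
        try:
            if req is None:
                break
            handle(req)
        finally:
            requests.task_done()

# Usage mirroring the test: one request followed by the shutdown sentinel.
_q: "queue.Queue" = queue.Queue()
_q.put({"request_id": "req1"})
_q.put(None)
_ev = threading.Event()
run_busy_loop(_q, _ev, handle=lambda r: None)
assert _ev.is_set() and _q.empty()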
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MockVllmConfig:
|
|
|
|
|
|
|
|
|
|
def __init__(self):
|
2025-09-04 08:22:10 +08:00
|
|
|
self.model_config = MagicMock()
|
2025-08-18 14:30:07 +08:00
|
|
|
self.parallel_config = MagicMock()
|
|
|
|
|
self.cache_config = MagicMock()
|
|
|
|
|
self.kv_transfer_config = MagicMock()
|
2026-01-04 16:49:33 +08:00
|
|
|
self.speculative_config = MagicMock()
|
2025-09-04 08:22:10 +08:00
|
|
|
self.model_config.use_mla = True
|
2025-08-18 14:30:07 +08:00
|
|
|
self.parallel_config.tensor_parallel_size = 2
|
2025-10-21 20:15:24 +08:00
|
|
|
self.parallel_config.data_parallel_rank = 0
|
2025-08-18 14:30:07 +08:00
|
|
|
self.parallel_config.data_parallel_size_local = 1
|
2025-12-10 16:01:43 +08:00
|
|
|
self.parallel_config.pipeline_parallel_size = 1
|
|
|
|
|
self.parallel_config.data_parallel_rank_local = 0
|
|
|
|
|
self.model_config.get_num_layers_by_block_type = MagicMock(
|
|
|
|
|
return_value=32)
|
2025-08-18 14:30:07 +08:00
|
|
|
self.cache_config.block_size = 16
|
|
|
|
|
self.kv_transfer_config.kv_port = 5000
|
|
|
|
|
self.kv_transfer_config.kv_role = 'kv_producer'
|
|
|
|
|
self.kv_transfer_config.get_from_extra_config = MagicMock()
|
|
|
|
|
self.kv_transfer_config.get_from_extra_config.side_effect = lambda k, d: {
|
|
|
|
|
"prefill": {
|
|
|
|
|
"tp_size": 2,
|
2025-12-10 16:01:43 +08:00
|
|
|
"dp_size": 1,
|
|
|
|
|
"pp_size": 1
|
2025-08-18 14:30:07 +08:00
|
|
|
},
|
|
|
|
|
"decode": {
|
|
|
|
|
"tp_size": 2,
|
2025-12-10 16:01:43 +08:00
|
|
|
"dp_size": 1,
|
|
|
|
|
"pp_size": 1
|
2025-08-18 14:30:07 +08:00
|
|
|
}
|
|
|
|
|
}.get(k, d)
|
2025-10-20 16:30:57 +08:00
|
|
|
self.additional_config = {}
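# Illustrative sketch of how the "prefill"/"decode" entries mocked above might
# be consumed (hypothetical helper; only get_from_extra_config(key, default)
# is assumed, which is the call the mock implements):

def read_parallel_sizes(kv_transfer_config, role: str):
    cfg = kv_transfer_config.get_from_extra_config(role, {})
    return cfg.get("tp_size", 1), cfg.get("dp_size", 1), cfg.get("pp_size", 1)

# With the side_effect above, both roles resolve to tp=2, dp=1, pp=1.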
|
2025-08-18 14:30:07 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class MockRequest:
|
|
|
|
|
|
|
|
|
|
def __init__(self,
|
|
|
|
|
request_id,
|
|
|
|
|
prompt_token_ids=None,
|
|
|
|
|
kv_transfer_params=None,
|
|
|
|
|
status=None):
|
|
|
|
|
self.request_id = request_id
|
|
|
|
|
self.prompt_token_ids = prompt_token_ids or [1, 2, 3, 4]
|
|
|
|
|
self.kv_transfer_params = kv_transfer_params or {}
|
|
|
|
|
self.status = status or "running"
|
|
|
|
|
self.output_token_ids = [101, 102]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestKVCacheTaskTracker(unittest.TestCase):
|
|
|
|
|
|
|
|
|
|
def setUp(self):
|
2025-09-04 08:22:10 +08:00
|
|
|
self.tracker = KVCacheTaskTracker()
|
2025-08-18 14:30:07 +08:00
|
|
|
|
2025-09-04 08:22:10 +08:00
|
|
|
def test_update_done_task_count(self):
|
2025-08-18 14:30:07 +08:00
|
|
|
self.assertEqual(len(self.tracker.finished_requests), 0)
|
2025-09-04 08:22:10 +08:00
|
|
|
self.assertEqual(len(self.tracker.delayed_free_requests), 0)
|
2025-09-24 11:22:46 +08:00
|
|
|
self.assertEqual(len(self.tracker.record_finished_requests), 0)
|
2025-09-04 08:22:10 +08:00
|
|
|
|
|
|
|
|
current_time = time.time()
|
|
|
|
|
self.tracker.add_delayed_request("req_1", current_time)
|
|
|
|
|
result = self.tracker.delayed_free_requests
|
2025-09-24 11:22:46 +08:00
|
|
|
result_record = self.tracker.record_finished_requests
|
2025-09-04 08:22:10 +08:00
|
|
|
self.assertEqual(len(result), 1)
|
2025-09-24 11:22:46 +08:00
|
|
|
self.assertEqual(result["req_1"], current_time)
|
|
|
|
|
self.assertEqual(len(result_record), 0)
|
2025-09-04 08:22:10 +08:00
|
|
|
|
|
|
|
|
self.tracker.update_done_task_count("req_1")
|
|
|
|
|
result_finished = self.tracker.finished_requests
|
|
|
|
|
result_delayed = self.tracker.delayed_free_requests
|
2025-09-24 11:22:46 +08:00
|
|
|
result_record = self.tracker.record_finished_requests
|
2025-09-04 08:22:10 +08:00
|
|
|
self.assertEqual(result_finished, {"req_1"})
|
|
|
|
|
self.assertEqual(len(result_delayed), 0)
|
2025-09-24 11:22:46 +08:00
|
|
|
self.assertEqual(len(result_record), 0)
|
|
|
|
|
|
|
|
|
|
self.tracker.update_done_task_count("req_2")
|
|
|
|
|
result_finished = self.tracker.finished_requests
|
|
|
|
|
result_delayed = self.tracker.delayed_free_requests
|
|
|
|
|
result_record = self.tracker.record_finished_requests
|
|
|
|
|
self.assertEqual(result_finished, {"req_1", "req_2"})
|
|
|
|
|
self.assertEqual(len(result_delayed), 0)
|
|
|
|
|
self.assertEqual(len(result_record), 1)
|
|
|
|
|
self.assertEqual(result_record, {"req_2"})
|
|
|
|
|
|
|
|
|
|
def test_update_add_delayed_request(self) -> None:
|
|
|
|
|
self.tracker.update_done_task_count("req2")
|
|
|
|
|
result_start_record = self.tracker.record_finished_requests
|
|
|
|
|
self.assertEqual(len(result_start_record), 1)
|
|
|
|
|
self.tracker.add_delayed_request("req2", time.time())
|
|
|
|
|
result_delayed = self.tracker.delayed_free_requests
|
|
|
|
|
result_end_record = self.tracker.record_finished_requests
|
|
|
|
|
self.assertEqual(len(result_delayed), 0)
|
|
|
|
|
self.assertEqual(len(result_end_record), 0)
|
2025-09-04 08:22:10 +08:00
|
|
|
|
|
|
|
|
def test_retrieve_expired_requests(self):
|
|
|
|
|
current_time = time.time()
|
|
|
|
|
self.tracker.add_delayed_request("req_1", current_time - 600)
|
|
|
|
|
self.tracker.add_delayed_request("req_2", current_time)
|
|
|
|
|
result = self.tracker._retrieve_expired_requests()
|
|
|
|
|
self.assertEqual(result, {
|
|
|
|
|
"req_1",
|
|
|
|
|
})
|
|
|
|
|
result_delay = self.tracker.delayed_free_requests
|
|
|
|
|
self.assertEqual(len(result_delay), 1)
|
2025-09-24 11:22:46 +08:00
|
|
|
self.assertIn("req_2", result_delay)
|
2025-08-18 14:30:07 +08:00
|
|
|
|
|
|
|
|
def test_duplicate_task_update(self):
|
2025-09-04 08:22:10 +08:00
|
|
|
self.tracker.update_done_task_count("req1")
|
|
|
|
|
self.tracker.update_done_task_count("req1")
|
|
|
|
|
self.tracker.update_done_task_count("req1")
|
2025-08-18 14:30:07 +08:00
|
|
|
|
|
|
|
|
finished = self.tracker.get_and_clear_finished_requests()
|
|
|
|
|
self.assertEqual(finished, {"req1"})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestMooncakeConnectorMetadata(unittest.TestCase):
|
|
|
|
|
|
|
|
|
|
def test_add_new_req(self):
|
|
|
|
|
meta = MooncakeConnectorMetadata()
|
2025-09-04 08:22:10 +08:00
|
|
|
self.assertEqual(len(meta.requests), 0)
|
|
|
|
|
self.assertEqual(len(meta.requests_to_send), 0)
|
|
|
|
|
|
2025-08-18 14:30:07 +08:00
|
|
|
meta.add_new_req(request_id="req1",
|
|
|
|
|
local_block_ids=[1, 2, 3],
|
2025-12-31 09:53:03 +08:00
|
|
|
num_external_tokens=48,
|
2025-08-18 14:30:07 +08:00
|
|
|
kv_transfer_params={
|
|
|
|
|
"remote_block_ids": [4, 5, 6],
|
|
|
|
|
"remote_engine_id": "remote_engine",
|
|
|
|
|
"remote_host": "localhost",
|
2025-11-18 10:17:48 +08:00
|
|
|
"remote_port": 5000,
|
|
|
|
|
"remote_pcp_size": 1,
|
2026-01-26 14:28:33 +08:00
|
|
|
"remote_dcp_size": 1,
|
|
|
|
|
"remote_ptp_size": 2
|
2025-08-18 14:30:07 +08:00
|
|
|
})
|
|
|
|
|
|
|
|
|
|
self.assertEqual(len(meta.requests), 1)
|
|
|
|
|
req_meta = meta.requests["req1"]
|
|
|
|
|
self.assertIsInstance(req_meta, ReqMeta)
|
|
|
|
|
self.assertEqual(req_meta.local_block_ids, [1, 2, 3])
|
|
|
|
|
self.assertEqual(req_meta.remote_block_ids, [4, 5, 6])
|
|
|
|
|
self.assertEqual(req_meta.remote_engine_id, "remote_engine")
|
|
|
|
|
self.assertEqual(req_meta.remote_host, "localhost")
|
|
|
|
|
self.assertEqual(req_meta.remote_port, 5000)
|
2026-01-26 14:28:33 +08:00
|
|
|
self.assertEqual(req_meta.remote_ptp_size, 2)
|
2025-08-18 14:30:07 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestMooncakeConnectorSchedulerMatchedTokens(unittest.TestCase):
|
|
|
|
|
|
|
|
|
|
def setUp(self):
|
|
|
|
|
config = MockVllmConfig()
|
2025-10-23 20:39:14 +08:00
|
|
|
self.p1 = patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.init_ascend_config',
|
2025-12-07 15:19:42 +08:00
|
|
|
new=MagicMock())
|
|
|
|
|
self.p2 = patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_ascend_config',
|
2025-12-07 15:19:42 +08:00
|
|
|
new=MagicMock(return_value=MagicMock()))
|
2025-10-23 20:39:14 +08:00
|
|
|
self.p1.start()
|
2025-12-07 15:19:42 +08:00
|
|
|
self.p2.start()
|
2025-10-23 20:39:14 +08:00
|
|
|
self.addCleanup(self.p1.stop)
|
2025-12-07 15:19:42 +08:00
|
|
|
self.addCleanup(self.p2.stop)
|
2025-10-23 20:39:14 +08:00
|
|
|
self.scheduler = MooncakeConnectorScheduler(config, "test_engine")
|
2025-08-18 14:30:07 +08:00
|
|
|
|
|
|
|
|
def test_get_num_new_matched_tokens(self):
|
|
|
|
|
request = MockRequest("req1")
|
|
|
|
|
tokens, async_flag = self.scheduler.get_num_new_matched_tokens(
|
|
|
|
|
request, 0)
|
|
|
|
|
self.assertEqual(tokens, 0)
|
|
|
|
|
self.assertFalse(async_flag)
|
|
|
|
|
|
|
|
|
|
request.kv_transfer_params = {"do_remote_prefill": True}
|
|
|
|
|
tokens, async_flag = self.scheduler.get_num_new_matched_tokens(
|
|
|
|
|
request, 0)
|
2025-10-23 20:39:14 +08:00
|
|
|
self.assertEqual(tokens, 4)
|
2025-08-18 14:30:07 +08:00
|
|
|
self.assertTrue(async_flag)
|
|
|
|
|
|
|
|
|
|
def test_build_connector_meta(self):
|
|
|
|
|
request = MockRequest("req1")
|
|
|
|
|
blocks_mock = MagicMock()
|
|
|
|
|
blocks_mock.get_unhashed_block_ids.return_value = [4, 5, 6]
|
2025-12-31 09:53:03 +08:00
|
|
|
self.scheduler._reqs_need_recv["req1"] = (request, [4, 5, 6], 48)
|
2025-08-18 14:30:07 +08:00
|
|
|
request.kv_transfer_params = {
|
|
|
|
|
"remote_block_ids": [1, 2, 3],
|
|
|
|
|
"remote_engine_id": "remote",
|
|
|
|
|
"remote_host": "localhost",
|
2025-11-18 10:17:48 +08:00
|
|
|
"remote_port": 5000,
|
|
|
|
|
"remote_pcp_size": 1,
|
|
|
|
|
"remote_dcp_size": 1
|
2025-08-18 14:30:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
meta = self.scheduler.build_connector_meta(MagicMock())
|
|
|
|
|
self.assertIsInstance(meta, MooncakeConnectorMetadata)
|
|
|
|
|
self.assertEqual(len(meta.requests), 1)
|
|
|
|
|
self.assertEqual(meta.requests["req1"].local_block_ids, [4, 5, 6])
|
|
|
|
|
self.assertEqual(meta.requests["req1"].remote_block_ids, [1, 2, 3])
|
|
|
|
|
self.assertEqual(len(self.scheduler._reqs_need_recv), 0)
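# Illustrative sketch of the hand-off this test checks (hypothetical code, not
# the scheduler's implementation): requests queued in _reqs_need_recv are
# copied into the connector metadata for the next step and the queue is
# cleared.  The real build_connector_meta additionally builds ReqMeta objects;
# only the drain-and-clear behaviour is modelled here.

def drain_reqs_need_recv(reqs_need_recv: dict) -> dict:
    """Return {request_id: (local_block_ids, kv_transfer_params)} and empty the input."""
    drained = {
        req_id: (block_ids, req.kv_transfer_params)
        for req_id, (req, block_ids, _num_tokens) in reqs_need_recv.items()
    }
    reqs_need_recv.clear()
    return drained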
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestHelperFunctions(unittest.TestCase):
|
|
|
|
|
|
|
|
|
|
def test_group_concurrent_contiguous(self):
|
|
|
|
|
src: list[int] = [1, 2, 3, 5, 6]
|
|
|
|
|
dst: list[int] = [10, 11, 12, 14, 15]
|
|
|
|
|
|
|
|
|
|
src_groups, dst_groups = group_concurrent_contiguous(src, dst)
|
|
|
|
|
|
|
|
|
|
self.assertEqual(len(src_groups), 2)
|
|
|
|
|
self.assertEqual(src_groups[0], [1, 2, 3])
|
|
|
|
|
self.assertEqual(src_groups[1], [5, 6])
|
|
|
|
|
self.assertEqual(dst_groups[0], [10, 11, 12])
|
|
|
|
|
self.assertEqual(dst_groups[1], [14, 15])
|
|
|
|
|
|
|
|
|
|
def test_group_concurrent_contiguous_empty(self):
|
|
|
|
|
src: list[int] = []
|
|
|
|
|
dst: list[int] = []
|
|
|
|
|
src_groups, dst_groups = group_concurrent_contiguous(src, dst)
|
|
|
|
|
self.assertEqual(src_groups, [])
|
|
|
|
|
self.assertEqual(dst_groups, [])
|
|
|
|
|
|
|
|
|
|
def test_string_to_int64_hash(self):
|
|
|
|
|
hash1 = string_to_int64_hash("test_string")
|
|
|
|
|
hash2 = string_to_int64_hash("test_string")
|
|
|
|
|
self.assertEqual(hash1, hash2)
|
|
|
|
|
|
|
|
|
|
hash3 = string_to_int64_hash("different_string")
|
|
|
|
|
self.assertNotEqual(hash1, hash3)
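# Reference sketches matching the behaviour asserted above (illustrative
# reimplementations, not the functions imported from the connector module):

import hashlib

def group_contiguous(src, dst):
    """Split parallel id lists into runs where both sides increase by 1."""
    src_groups, dst_groups = [], []
    for i, (s, d) in enumerate(zip(src, dst)):
        if i and s == src[i - 1] + 1 and d == dst[i - 1] + 1:
            src_groups[-1].append(s)
            dst_groups[-1].append(d)
        else:
            src_groups.append([s])
            dst_groups.append([d])
    return src_groups, dst_groups

def str_to_int64(s: str) -> int:
    """Stable 64-bit integer digest of a string (signed wrap-around not handled here)."""
    return int.from_bytes(hashlib.md5(s.encode()).digest()[:8], "little")

assert group_contiguous([1, 2, 3, 5, 6], [10, 11, 12, 14, 15]) == \
    ([[1, 2, 3], [5, 6]], [[10, 11, 12], [14, 15]])
assert str_to_int64("test_string") == str_to_int64("test_string")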
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestMooncakeConnectorForScheduler(unittest.TestCase):
|
|
|
|
|
|
|
|
|
|
def test_scheduler_role(self):
|
|
|
|
|
config = MockVllmConfig()
|
2025-10-20 16:30:57 +08:00
|
|
|
with patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.init_ascend_config'
|
2025-12-07 15:19:42 +08:00
|
|
|
), patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_ascend_config',
|
2025-12-07 15:19:42 +08:00
|
|
|
return_value=MagicMock()):
|
2025-10-20 16:30:57 +08:00
|
|
|
connector = MooncakeConnector(config, KVConnectorRole.SCHEDULER)
|
2025-08-18 14:30:07 +08:00
|
|
|
self.assertIsNotNone(connector.connector_scheduler)
|
|
|
|
|
self.assertIsNone(connector.connector_worker)
|
|
|
|
|
|
|
|
|
|
@patch.object(MooncakeConnectorScheduler, "get_num_new_matched_tokens")
|
|
|
|
|
def test_scheduler_methods(self, mock_method):
|
|
|
|
|
config = MockVllmConfig()
|
2025-10-20 16:30:57 +08:00
|
|
|
with patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.init_ascend_config'
|
2025-12-07 15:19:42 +08:00
|
|
|
), patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_ascend_config',
|
2025-12-07 15:19:42 +08:00
|
|
|
return_value=MagicMock()):
|
2025-10-20 16:30:57 +08:00
|
|
|
connector = MooncakeConnector(config, KVConnectorRole.SCHEDULER)
|
2025-08-18 14:30:07 +08:00
|
|
|
request = MockRequest("req1")
|
|
|
|
|
connector.get_num_new_matched_tokens(request, 0)
|
|
|
|
|
mock_method.assert_called_once_with(request, 0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MockKVCacheBlocks:
|
|
|
|
|
|
|
|
|
|
def get_unhashed_block_ids(self):
|
|
|
|
|
return [4, 5, 6]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MockSchedulerOutput:
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MockForwardContext:
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestMooncakeConnector(unittest.TestCase):
|
|
|
|
|
|
|
|
|
|
def setUp(self):
|
|
|
|
|
self.config = MockVllmConfig()
|
|
|
|
|
os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
|
|
|
|
|
|
|
|
|
|
def test_scheduler_initialization(self):
|
2025-10-20 16:30:57 +08:00
|
|
|
with patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.init_ascend_config'
|
2025-12-07 15:19:42 +08:00
|
|
|
), patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_ascend_config',
|
2025-12-07 15:19:42 +08:00
|
|
|
return_value=MagicMock()):
|
2025-10-20 16:30:57 +08:00
|
|
|
connector = MooncakeConnector(self.config,
|
|
|
|
|
KVConnectorRole.SCHEDULER)
|
2025-08-18 14:30:07 +08:00
|
|
|
self.assertIsNotNone(connector.connector_scheduler)
|
|
|
|
|
self.assertIsNone(connector.connector_worker)
|
|
|
|
|
|
|
|
|
|
@patch.object(MooncakeConnectorScheduler, "get_num_new_matched_tokens")
|
|
|
|
|
def test_get_num_new_matched_tokens(self, mock_method):
|
2025-10-20 16:30:57 +08:00
|
|
|
with patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.init_ascend_config'
|
2025-12-07 15:19:42 +08:00
|
|
|
), patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_ascend_config',
|
2025-12-07 15:19:42 +08:00
|
|
|
return_value=MagicMock()):
|
2025-10-20 16:30:57 +08:00
|
|
|
connector = MooncakeConnector(self.config,
|
|
|
|
|
KVConnectorRole.SCHEDULER)
|
2025-08-18 14:30:07 +08:00
|
|
|
request = MockRequest("req1")
|
|
|
|
|
connector.get_num_new_matched_tokens(request, 0)
|
|
|
|
|
mock_method.assert_called_once_with(request, 0)
|
|
|
|
|
|
|
|
|
|
@patch.object(MooncakeConnectorScheduler, "update_state_after_alloc")
|
|
|
|
|
def test_update_state_after_alloc(self, mock_method):
|
2025-10-20 16:30:57 +08:00
|
|
|
with patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.init_ascend_config'
|
2025-12-07 15:19:42 +08:00
|
|
|
), patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_ascend_config',
|
2025-12-07 15:19:42 +08:00
|
|
|
return_value=MagicMock()):
|
2025-10-20 16:30:57 +08:00
|
|
|
connector = MooncakeConnector(self.config,
|
|
|
|
|
KVConnectorRole.SCHEDULER)
|
2025-08-18 14:30:07 +08:00
|
|
|
request = MockRequest("req1")
|
|
|
|
|
blocks = MockKVCacheBlocks()
|
|
|
|
|
connector.update_state_after_alloc(request, blocks, 3)
|
|
|
|
|
mock_method.assert_called_once_with(request, blocks, 3)
|
|
|
|
|
|
|
|
|
|
@patch.object(MooncakeConnectorScheduler, "build_connector_meta")
|
|
|
|
|
def test_build_connector_meta(self, mock_method):
|
2025-10-20 16:30:57 +08:00
|
|
|
with patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.init_ascend_config'
|
2025-12-07 15:19:42 +08:00
|
|
|
), patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_ascend_config',
|
2025-12-07 15:19:42 +08:00
|
|
|
return_value=MagicMock()):
|
2025-10-20 16:30:57 +08:00
|
|
|
connector = MooncakeConnector(self.config,
|
|
|
|
|
KVConnectorRole.SCHEDULER)
|
2025-08-18 14:30:07 +08:00
|
|
|
scheduler_output = MockSchedulerOutput()
|
|
|
|
|
connector.build_connector_meta(scheduler_output)
|
|
|
|
|
mock_method.assert_called_once_with(scheduler_output)
|
|
|
|
|
|
|
|
|
|
@patch.object(MooncakeConnectorScheduler, "request_finished")
|
|
|
|
|
def test_request_finished(self, mock_method):
|
2025-10-20 16:30:57 +08:00
|
|
|
with patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.init_ascend_config'
|
2025-12-07 15:19:42 +08:00
|
|
|
), patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_ascend_config',
|
2025-12-07 15:19:42 +08:00
|
|
|
return_value=MagicMock()):
|
2025-10-20 16:30:57 +08:00
|
|
|
connector = MooncakeConnector(self.config,
|
|
|
|
|
KVConnectorRole.SCHEDULER)
|
2025-08-18 14:30:07 +08:00
|
|
|
request = MockRequest("req1")
|
|
|
|
|
connector.request_finished(request, [1, 2, 3])
|
|
|
|
|
mock_method.assert_called_once_with(request, [1, 2, 3])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestMooncakeConnectorScheduler(unittest.TestCase):
|
|
|
|
|
|
|
|
|
|
def setUp(self):
|
|
|
|
|
self.config = MockVllmConfig()
|
2025-10-20 16:30:57 +08:00
|
|
|
with patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.init_ascend_config'
|
2025-12-07 15:19:42 +08:00
|
|
|
), patch(
|
2026-01-15 08:57:40 +08:00
|
|
|
'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_ascend_config',
|
2025-12-07 15:19:42 +08:00
|
|
|
return_value=MagicMock()):
|
2025-10-20 16:30:57 +08:00
|
|
|
self.scheduler = MooncakeConnectorScheduler(
|
|
|
|
|
self.config, "test_engine")
|
2025-08-18 14:30:07 +08:00
|
|
|
|
|
|
|
|
def test_get_num_new_matched_tokens_no_remote_prefill(self):
|
|
|
|
|
request = MockRequest("req1")
|
|
|
|
|
tokens, async_flag = self.scheduler.get_num_new_matched_tokens(
|
|
|
|
|
request, 0)
|
|
|
|
|
self.assertEqual(tokens, 0)
|
|
|
|
|
self.assertFalse(async_flag)
|
|
|
|
|
|
|
|
|
|
def test_get_num_new_matched_tokens_with_remote_prefill(self):
|
|
|
|
|
request = MockRequest("req1",
|
|
|
|
|
kv_transfer_params={"do_remote_prefill": True})
|
|
|
|
|
tokens, async_flag = self.scheduler.get_num_new_matched_tokens(
|
|
|
|
|
request, 0)
|
2025-10-23 20:39:14 +08:00
|
|
|
self.assertEqual(tokens, 4)
|
2025-08-18 14:30:07 +08:00
|
|
|
self.assertTrue(async_flag)
|
|
|
|
|
|
|
|
|
|
def test_update_state_after_alloc_no_remote_prefill(self):
|
|
|
|
|
request = MockRequest("req1")
|
|
|
|
|
blocks = MagicMock()
|
|
|
|
|
self.scheduler.update_state_after_alloc(request, blocks, 0)
|
|
|
|
|
self.assertEqual(len(self.scheduler._reqs_need_recv), 0)
|
|
|
|
|
|
|
|
|
|
def test_update_state_after_alloc_with_remote_prefill(self):
|
|
|
|
|
request = MockRequest("req1",
|
|
|
|
|
kv_transfer_params={
|
|
|
|
|
"do_remote_prefill": True,
|
|
|
|
|
"remote_block_ids": [1, 2, 3],
|
|
|
|
|
"remote_engine_id": "remote",
|
|
|
|
|
"remote_host": "localhost",
|
|
|
|
|
"remote_port": 5000
|
|
|
|
|
})
|
|
|
|
|
blocks = MockKVCacheBlocks()
|
|
|
|
|
self.scheduler.update_state_after_alloc(request, blocks, 3)
|
|
|
|
|
self.assertEqual(len(self.scheduler._reqs_need_recv), 1)
|
|
|
|
|
self.assertEqual(self.scheduler._reqs_need_recv["req1"][0], request)
|
|
|
|
|
self.assertEqual(self.scheduler._reqs_need_recv["req1"][1], [4, 5, 6])
|
|
|
|
|
|
|
|
|
|
def test_request_finished_no_remote_decode(self):
|
|
|
|
|
request = MockRequest("req1")
|
|
|
|
|
delay_free, params = self.scheduler.request_finished(
|
|
|
|
|
request, [1, 2, 3])
|
|
|
|
|
self.assertFalse(delay_free)
|
|
|
|
|
self.assertIsNone(params)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestUtils(unittest.TestCase):
|
|
|
|
|
|
|
|
|
|
def test_string_to_int64_hash(self):
|
|
|
|
|
h1 = string_to_int64_hash("hello")
|
|
|
|
|
h2 = string_to_int64_hash("hello")
|
|
|
|
|
h3 = string_to_int64_hash("world")
|
|
|
|
|
self.assertEqual(h1, h2)
|
|
|
|
|
self.assertNotEqual(h1, h3)
|
|
|
|
|
self.assertIsInstance(h1, int)
|
|
|
|
|
|
|
|
|
|
def test_group_concurrent_contiguous(self):
|
|
|
|
|
src: list[int] = [1, 2, 3, 5, 6]
|
|
|
|
|
dst: list[int] = [10, 11, 12, 20, 21]
|
|
|
|
|
src_g, dst_g = group_concurrent_contiguous(src, dst)
|
|
|
|
|
self.assertEqual(src_g, [[1, 2, 3], [5, 6]])
|
|
|
|
|
self.assertEqual(dst_g, [[10, 11, 12], [20, 21]])
|
|
|
|
|
|
|
|
|
|
def test_group_empty(self):
|
|
|
|
|
src_g, dst_g = group_concurrent_contiguous([], [])
|
|
|
|
|
self.assertEqual(src_g, [])
|
|
|
|
|
self.assertEqual(dst_g, [])
|
|
|
|
|
|
|
|
|
|
def test_zmq_ctx_invalid_type(self):
|
|
|
|
|
with self.assertRaises(ValueError):
|
|
|
|
|
with zmq_ctx("INVALID", "tcp://127.0.0.1:5555"):
|
|
|
|
|
pass
|
|
|
|
|
|
2026-01-15 08:57:40 +08:00
|
|
|
@patch(
|
|
|
|
|
"vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.make_zmq_socket"
|
|
|
|
|
)
|
2025-08-18 14:30:07 +08:00
|
|
|
def test_zmq_ctx_ok(self, mock_make_socket):
|
|
|
|
|
mock_socket = MagicMock()
|
|
|
|
|
mock_make_socket.return_value = mock_socket
|
|
|
|
|
with zmq_ctx(zmq.REQ, "tcp://localhost:1234") as s: # type: ignore
|
|
|
|
|
self.assertEqual(s, mock_socket)
|
|
|
|
|
|
2026-01-15 08:57:40 +08:00
|
|
|
@patch(
|
|
|
|
|
"vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.logger")
|
2025-08-18 14:30:07 +08:00
|
|
|
def test_ensure_zmq_send_success(self, mock_logger):
|
|
|
|
|
mock_socket = MagicMock()
|
|
|
|
|
ensure_zmq_send(mock_socket, b"hello")
|
|
|
|
|
mock_socket.send.assert_called_once_with(b"hello")
|
|
|
|
|
|

    @patch(
        "vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.logger")
    def test_ensure_zmq_send_retry_and_fail(self, mock_logger):
        mock_socket = MagicMock()
        mock_socket.send.side_effect = zmq.ZMQError(  # type: ignore
            "send failed")
        with self.assertRaises(RuntimeError):
            ensure_zmq_send(mock_socket, b"hello", max_retries=2)
        self.assertEqual(mock_socket.send.call_count, 2)

    @patch(
        "vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.logger")
    def test_ensure_zmq_recv_success(self, mock_logger):
        mock_socket = MagicMock()
        mock_socket.recv.return_value = b"response"
        mock_poller = MagicMock()
        mock_poller.poll.return_value = [
            (mock_socket, zmq.POLLIN)  # type: ignore
        ]
        data = ensure_zmq_recv(mock_socket, mock_poller)
        self.assertEqual(data, b"response")

    @patch(
        "vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.logger")
    def test_ensure_zmq_recv_timeout_and_fail(self, mock_logger):
        mock_socket = MagicMock()
        mock_poller = MagicMock()
        mock_poller.poll.return_value = []
        with self.assertRaises(RuntimeError):
            ensure_zmq_recv(mock_socket,
                            mock_poller,
                            timeout=0.01,
                            max_retries=2)
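

# A short orientation note (not from the original file): the lightweight
# classes and helpers below stand in for the connector's runtime
# collaborators (agent/connector metadata, KV-cache sending/receiving
# threads, tensors, the Mooncake transfer engine and ascend env settings)
# so that MooncakeConnectorWorker can be constructed in the tests below
# without real NPU or network resources. They only mirror the attributes
# the tests actually touch.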
class MockMooncakeAgentMetadata:

    def __init__(self, **kwargs):
        pass


class MockMooncakeConnectorMetadata:

    def __init__(self):
        self.requests = {}


class MockKVCacheSendingThread(threading.Thread):

    def __init__(self, *args, **kwargs):
        super().__init__()
        self.daemon = True
        self._finished_requests = set()

    def get_and_clear_finished_requests(self):
        return self._finished_requests

    def start(self):
        pass


class MockKVCacheRecvingThread(threading.Thread):

    def __init__(self, *args, **kwargs):
        super().__init__()
        self.daemon = True
        self._finished_requests = set()
        self.add_request = MagicMock()

    def get_and_clear_finished_requests(self):
        return self._finished_requests

    def start(self):
        pass


class MockTensor:

    def __init__(self, *args, **kwargs):
        self.size = MagicMock(return_value=(10, 16, 8, 16))
        self.element_size = MagicMock(return_value=4)
        self.shape = (10, 16, 8, 16)
        self.data_ptr = MagicMock(return_value=0x1000)


mock_logger = MagicMock()


class MockTransferEngine:

    def initialize(self, *args, **kwargs):
        return 0

    def register_memory(self, *args, **kwargs):
        return 1


class MockEnvsAscend:
    MOONCAKE_CONNECTOR_PROTOCOL = "mock_protocol"


def mock_get_tensor_model_parallel_rank():
    return 0


def mock_get_tp_group():
    return MagicMock()


def mock_get_ip():
    return "127.0.0.1"


def mock_string_to_int64_hash(s):
    return hash(s)


class TestMooncakeConnectorWorker(unittest.TestCase):

    def setUp(self):
        self.mock_transfer_engine = MagicMock()
        self.mock_transfer_engine.get_rpc_port.return_value = 9090
        self.mock_transfer_engine.initialize.return_value = 0
        self.mock_transfer_engine.register_memory.return_value = 0

        self.patches = [
            patch('torch.Tensor.size', return_value=(10, 16, 8, 16)),
            patch('torch.Tensor.element_size', return_value=4),
            patch('torch.Tensor.data_ptr', return_value=0x1000),
            patch('math.prod', return_value=128),
            patch(
                'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_tensor_model_parallel_rank',
                mock_get_tensor_model_parallel_rank),
            patch(
                'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_tp_group',
                mock_get_tp_group),
            patch(
                'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_pp_group',
                return_value=_mock_pp_group),
            patch(
                'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_ip',
                mock_get_ip),
            patch(
                'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.string_to_int64_hash',
                mock_string_to_int64_hash),
            patch(
                'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.global_te.get_transfer_engine',
                return_value=self.mock_transfer_engine),
            patch(
                'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.global_te.register_buffer',
                return_value=None),
            patch(
                'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.KVCacheSendingThread',
                MagicMock()),
            patch(
                'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.KVCacheRecvingThread',
                MagicMock()),
            patch(
                'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.logger',
                MagicMock()),
            patch(
                'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.threading.Event',
                MagicMock()),
            patch(
                'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_ascend_config',
                return_value=MagicMock()),
        ]

        for p in self.patches:
            p.start()  # type: ignore

        self.vllm_config = MockVllmConfig()
        self.engine_id = "test_engine"
        self.kv_caches = {"layer1": (MagicMock(), MagicMock())}

    def tearDown(self):
        for p in self.patches:
            p.stop()  # type: ignore
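
    # The register_kv_caches tests below check how the worker is wired up by
    # kv_role: a producer should only create a sending thread, a consumer only
    # a receiving thread, and the MLA layout (two caches with different head
    # dims) should be detected and yield two block lengths.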

    def test_register_kv_caches_producer(self):
        worker = MooncakeConnectorWorker(self.vllm_config, self.engine_id)
        worker.register_kv_caches(self.kv_caches)
        self.assertEqual(len(worker.kv_caches), 1)
        self.assertIsNotNone(worker.kv_send_thread)
        self.assertIsNone(worker.kv_recv_thread)

    def test_register_kv_caches_consumer(self):
        self.vllm_config.kv_transfer_config.kv_role = 'kv_consumer'
        worker = MooncakeConnectorWorker(self.vllm_config, self.engine_id)
        worker.register_kv_caches(self.kv_caches)
        self.assertIsNone(worker.kv_send_thread)
        self.assertIsNotNone(worker.kv_recv_thread)

    def test_register_kv_caches_mla_case(self):
        mla_cache1 = MagicMock()
        mla_cache1.size.return_value = (10, 16, 1, 16)
        mla_cache2 = MagicMock()
        mla_cache2.size.return_value = (10, 16, 1, 8)
        mla_caches = {"layer1": (mla_cache1, mla_cache2)}

        worker = MooncakeConnectorWorker(self.vllm_config, self.engine_id)
        worker.register_kv_caches(mla_caches)
        self.assertTrue(worker.use_mla)
        self.assertEqual(len(worker.block_len), 2)

    def test_device_id_selection_with_physical_devices(self):
        # Test with physical devices set
        worker = MooncakeConnectorWorker(self.vllm_config, self.engine_id)
        # Default tp_rank is 0, so device_id should be 10
        self.assertIsNotNone(worker.engine)

    def test_get_remote_tp_rank(self):

        def get_tp_rank(prefill_tp_size: int,
                        prefill_pp_size: int,
                        decode_tp_size: int,
                        num_kv_heads: int,
                        tp_num_need_pulls: int,
                        is_deepseek_mla: bool,
                        remote_ptp_size: Optional[int] = None):
            with patch(
                    'vllm_ascend.distributed.kv_transfer.kv_p2p.mooncake_connector.get_ascend_config',
                    return_value=MagicMock()), \
                patch.object(self.vllm_config.kv_transfer_config, 'get_from_extra_config',
                             side_effect=lambda k, d=None: {
                                 "prefill": {"tp_size": prefill_tp_size, "dp_size": 1, "pp_size": prefill_pp_size},
                                 "decode": {"tp_size": decode_tp_size, "dp_size": 1, "pp_size": 1}
                             }.get(k, d)):
                self.vllm_config.model_config.hf_text_config.num_key_value_heads = num_kv_heads
                self.vllm_config.model_config.is_deepseek_mla = is_deepseek_mla
                worker = MooncakeConnectorWorker(self.vllm_config,
                                                 self.engine_id)
                worker.tp_num_need_pulls = tp_num_need_pulls
                worker.use_sparse = 0
                return worker._get_remote_ranks_for_req(
                    'test', remote_ptp_size)
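
        # Editor's note on the assertions below: each call checks which remote
        # prefill ranks this worker would pull KV from for a given
        # (prefill_tp, prefill_pp, decode_tp, num_kv_heads, tp_num_need_pulls,
        # is_mla) combination, with [0] inspecting the first local rank's
        # mapping. assertIn is used because more than one rank assignment is
        # acceptable, so the result only has to match one of the listed
        # candidates.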

        self.assertIn(
            get_tp_rank(16, 1, 1, 4, 4, False)[0],
            [[0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15]])
        self.assertIn(
            get_tp_rank(8, 1, 1, 4, 4, False)[0], [[0, 2, 4, 6], [1, 3, 5, 7]])
        self.assertIn(get_tp_rank(4, 1, 1, 4, 4, False)[0], [[0, 1, 2, 3]])
        self.assertIn(get_tp_rank(16, 1, 4, 4, 1, False),
                      [[[0], [4], [8], [12]], [[1], [5], [9], [13]],
                       [[2], [6], [10], [14]], [[3], [7], [11], [15]]])
        self.assertIn(get_tp_rank(8, 1, 4, 4, 1, False),
                      [[[0], [2], [4], [6]], [[1], [3], [5], [7]]])
        self.assertIn(get_tp_rank(4, 2, 2, 4, 2, False),
                      [[[0, 1, 4, 5], [2, 3, 6, 7]]])
        self.assertIn(get_tp_rank(4, 1, 4, 4, 1, False),
                      [[[0], [1], [2], [3]]])
        self.assertIn(
            get_tp_rank(8, 2, 1, 4, 4, False)[0],
            [[0, 2, 4, 6, 8, 10, 12, 14], [1, 3, 5, 7, 9, 11, 13, 15]])
        self.assertIn(get_tp_rank(4, 2, 2, 4, 2, False),
                      [[[0, 1, 4, 5], [2, 3, 6, 7]]])
        self.assertIn(get_tp_rank(2, 2, 1, 4, 2, False), [[[0, 1, 2, 3]]])
        self.assertIn(
            get_tp_rank(4, 4, 2, 8, 2, False),
            [[[0, 1, 4, 5, 8, 9, 12, 13], [2, 3, 6, 7, 10, 11, 14, 15]]])
        self.assertIn(
            get_tp_rank(4, 2, 1, 4, 4, False)[0], [[0, 1, 2, 3, 4, 5, 6, 7]])
        self.assertIn(
            get_tp_rank(4, 4, 1, 4, 4, False)[0],
            [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]])
        self.assertIn(get_tp_rank(8, 2, 4, 4, 1, False),
                      [[[0, 8], [2, 10], [4, 12], [6, 14]],
                       [[1, 9], [3, 11], [5, 13], [7, 15]]])
        self.assertIn(get_tp_rank(4, 2, 4, 4, 4, False),
                      [[[0, 4], [1, 5], [2, 6], [3, 7]]])
        self.assertIn(
            get_tp_rank(4, 4, 4, 4, 1, False),
            [[[0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15]]])
        self.assertIn(
            get_tp_rank(16, 1, 1, 1, 1,
                        True)[0], [[0], [1], [2], [3], [4], [5], [6], [7], [8],
                                   [9], [10], [11], [12], [13], [14], [15]])
        self.assertIn(get_tp_rank(4, 1, 4, 1, 1, True), [[[0], [1], [2], [3]]])
        self.assertIn(
            get_tp_rank(8, 2, 1, 1, 1, True)[0],
            [[0, 8], [2, 10], [4, 12], [6, 14], [1, 9], [3, 11], [5, 13],
             [7, 15]])
        self.assertIn(
            get_tp_rank(4, 4, 1, 1, 1, True)[0],
            [[0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15]])
        self.assertIn(
            get_tp_rank(8, 2, 4, 1, 1, True)[0],
            [[0, 8], [2, 10], [4, 12], [6, 14], [1, 9], [3, 11], [5, 13],
             [7, 15]])
        self.assertIn(
            get_tp_rank(4, 4, 4, 1, 1, True),
            [[[0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15]]])

        # check remote ptp size
        self.assertListEqual(get_tp_rank(16, 1, 2, 4, 2, False, 8),
                             get_tp_rank(8, 1, 2, 4, 2, False))
        self.assertListEqual(get_tp_rank(8, 1, 2, 4, 2, False, 4),
                             get_tp_rank(4, 1, 2, 4, 2, False))
        self.assertListEqual(get_tp_rank(4, 1, 2, 4, 1, False, 2),
                             get_tp_rank(2, 1, 2, 4, 1, False))

    def test_get_kv_split_metadata(self):

        def get_kv_split_metadata(use_mla,
                                  pcp_size,
                                  dcp_size,
                                  tp_size,
                                  tp_rank,
                                  pcp_rank,
                                  _prefill_tp_size,
                                  remote_pcp_size,
                                  remote_dcp_size,
                                  remote_port,
                                  remote_block_ids,
                                  local_block_ids,
                                  remote_engine_id,
                                  remote_ptp_size=None):

            worker = MooncakeConnectorWorker(self.vllm_config, self.engine_id)

            worker.use_mla = use_mla
            worker.pcp_size = pcp_size
            worker.dcp_size = dcp_size
            worker.tp_size = tp_size
            worker.tp_rank = tp_rank
            worker.pcp_rank = pcp_rank
            worker._prefill_tp_size = _prefill_tp_size
            worker.local_remote_block_port_mapping = {}
            worker.block_size = 16
            worker.num_key_value_heads = 1

            meta = types.SimpleNamespace()

            meta.remote_pcp_size = remote_pcp_size
            meta.remote_dcp_size = remote_dcp_size
            meta.remote_ptp_size = remote_ptp_size
            meta.remote_port = remote_port
            meta.remote_block_ids = remote_block_ids
            meta.local_block_ids = local_block_ids
            meta.num_external_tokens = pcp_size * dcp_size * len(
                local_block_ids) * worker.block_size
            meta.num_prompt_blocks = pcp_size * dcp_size * len(local_block_ids)
            meta.remote_engine_id = remote_engine_id

            remote_handshake_port_list, local_block_ids_list, remote_block_ids_list = worker._get_kv_split_metadata(
                '0', meta)

            return remote_handshake_port_list, local_block_ids_list, remote_block_ids_list
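
        # Editor's note on the expected tuples below: _get_kv_split_metadata
        # returns, per remote handshake port (derived from the base
        # remote_port of 30000), the local block ids this rank should fill and
        # the remote block ids to pull them from. The cases cover MLA and GQA
        # models across different pcp/dcp/tp layouts.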

        self.assertEqual(
            get_kv_split_metadata(True, 1, 1, 8, 1, 0, 8, 1, 8, 30000, [1],
                                  [1], 0),
            ([[30001], [30002], [30003], [30004], [30005], [30006], [30007],
              [30000]], [[], [], [], [], [], [], [], [1]], [[], [], [], [], [],
                                                            [], [], [1]]))

        self.assertEqual(
            get_kv_split_metadata(False, 1, 1, 8, 1, 0, 8, 2, 8, 30000, [1],
                                  [1], 0),
            ([[30001], [30002], [30003], [30004], [30005], [30006], [30007],
              [30008], [30009], [30010], [30011], [30012], [30013], [30014],
              [30015], [30000]
              ], [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [],
                  [1]], [[], [], [], [], [], [], [], [], [], [], [], [], [],
                         [], [], [1]]))

        self.assertEqual(
            get_kv_split_metadata(True, 1, 1, 8, 1, 0, 8, 2, 2, 30000, [1],
                                  [1], 0),
            ([[30001], [30008], [30009], [30000]], [[], [], [], [1]
                                                    ], [[], [], [], [1]]))

        self.assertEqual(
            get_kv_split_metadata(False, 1, 1, 8, 1, 0, 8, 2, 2, 30000, [1],
                                  [1], 0),
            ([[30001], [30008], [30009], [30000]], [[], [], [], [1]
                                                    ], [[], [], [], [1]]))

        self.assertEqual(
            get_kv_split_metadata(True, 1, 2, 8, 1, 0, 8, 2, 2, 30000, [1],
                                  [1], 0),
            ([[30000], [30008]], [[1], []], [[1], []]))

        self.assertEqual(
            get_kv_split_metadata(False, 1, 2, 8, 1, 0, 8, 2, 2, 30000, [1],
                                  [1], 0),
            ([[30000], [30008]], [[1], []], [[1], []]))

        self.assertEqual(
            get_kv_split_metadata(True, 1, 2, 8, 0, 0, 8, 2, 2, 30000,
                                  [1, 2, 3], [1, 2, 3, 4, 5], 0),
            ([[30000], [30008]], [[1, 2, 3], [4, 5]], [[1, 2, 3], [1, 2]]))

        # check remote ptp size
        self.assertEqual(
            get_kv_split_metadata(True, 1, 1, 8, 1, 0, 8, 1, 8, 30000, [1],
                                  [1], 0, 16),
            get_kv_split_metadata(True, 1, 1, 8, 1, 0, 16, 1, 8, 30000, [1],
                                  [1], 0)
        )
        self.assertEqual(
            get_kv_split_metadata(False, 1, 1, 8, 1, 0, 8, 1, 8, 30000, [1],
                                  [1], 0, 16),
            get_kv_split_metadata(False, 1, 1, 8, 1, 0, 16, 1, 8, 30000, [1],
                                  [1], 0)
        )
        self.assertEqual(
            get_kv_split_metadata(False, 1, 1, 8, 1, 0, 8, 2, 8, 30000, [1],
                                  [1], 0, 16),
            get_kv_split_metadata(False, 1, 1, 8, 1, 0, 16, 2, 8, 30000, [1],
                                  [1], 0)
        )

    def test_get_tp_num_need_pulls(self):
        worker = MooncakeConnectorWorker(self.vllm_config, self.engine_id)
        worker.num_key_value_heads = 8

        tp_num_need_pulls = worker._get_tp_num_need_pulls(prefill_tp_size=4)
        self.assertEqual(tp_num_need_pulls, 1)

        worker.vllm_config.model_config.is_deepseek_mla = False
        tp_num_need_pulls = worker._get_tp_num_need_pulls(prefill_tp_size=4)
        self.assertEqual(tp_num_need_pulls, 2)

        tp_num_need_pulls = worker._get_tp_num_need_pulls(prefill_tp_size=None)
        self.assertEqual(tp_num_need_pulls, 1)


if __name__ == '__main__':
    unittest.main()