xc-llm-ascend/tests/ut/kv_connector/test_llmdatadist_connector.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.

import os
import types

from tests.ut.kv_connector.utils import (create_request, create_scheduler,
                                         create_vllm_config)
from vllm_ascend.distributed.llmdatadist_c_mgr_connector import (
    LLMDataDistCMgrConnectorMetadata, LLMDataDistCMgrConnectorWorker, LLMRole)


def test_basic_inferface():
    """Schedule one remote-prefill request and verify the connector metadata.

    The scheduler output must carry an LLMDataDistCMgrConnectorMetadata that
    lists exactly this request, and the local block ids recorded for it must
    match the ids of the blocks the KV cache manager holds for the request.
    """
    config = create_vllm_config()
    sched = create_scheduler(config)

    # Prompt length covers two full blocks plus half of a third one.
    block_size = config.cache_config.block_size
    full_blocks = 2
    num_tokens = int(block_size * (full_blocks + 0.5))

    req = create_request(
        request_id=1,
        num_tokens=num_tokens,
        do_remote_prefill=True,
    )
    req_id = req.request_id
    sched.add_request(req)

    # Scheduling a remote-prefill request should yield connector metadata.
    metadata = sched.schedule().kv_connector_metadata
    assert metadata is not None
    assert isinstance(metadata, LLMDataDistCMgrConnectorMetadata)

    tracked_requests = metadata.requests
    assert len(tracked_requests) == 1
    assert req_id in tracked_requests

    # Compare the ids reported by the connector with the scheduler's blocks,
    # pairwise and in order.
    reported_ids = tracked_requests[req_id].local_block_ids
    manager = sched.kv_cache_manager.coordinator.single_type_managers[0]
    allocated_blocks = manager.req_to_blocks[req_id]
    for reported_id, allocated in zip(reported_ids, allocated_blocks):
        assert reported_id == allocated.block_id


def test_read_agent_metadata():
    """Verify the device IP that read_agent_metadata picks from a rank table.

    The rank table describes two servers with two prefill devices each. For
    every combination of worker IP, TP rank and ASCEND_RT_VISIBLE_DEVICES
    value below, the returned agent metadata must carry the expected
    ``device_ip``.
    """
    rank_table = {
        "version":
        "1.2",
        "server_count":
        "2",
        "prefill_device_list": [{
            "server_id": "192.168.1.1",
            "device_id": "0",
            "device_ip": "10.30.0.1",
            "cluster_id": "0",
        }, {
            "server_id": "192.168.1.1",
            "device_id": "1",
            "device_ip": "10.30.0.2",
            "cluster_id": "1",
        }, {
            "server_id": "192.168.1.2",
            "device_id": "0",
            "device_ip": "10.30.0.3",
            "cluster_id": "2",
        }, {
            "server_id": "192.168.1.2",
            "device_id": "1",
            "device_ip": "10.30.0.4",
            "cluster_id": "3",
        }]
    }

    def get_device_ip(worker_local_ip, worker_tp_rank, worker_visible_devices):
        """Return the device IP resolved for a stand-in prefill worker.

        ASCEND_RT_VISIBLE_DEVICES is overridden only for the duration of the
        call and is always put back to its prior state afterwards, even when
        read_agent_metadata raises, so the override cannot leak into other
        tests.
        """
        env_key = "ASCEND_RT_VISIBLE_DEVICES"
        # None (rather than "") lets us tell "unset" apart from "set to empty".
        old_visible_devices = os.environ.get(env_key)

        # A bare namespace stands in for the worker; only these attributes
        # are populated before calling the unbound method.
        worker = types.SimpleNamespace()
        worker.local_ip = worker_local_ip
        worker.tp_rank = worker_tp_rank
        worker.llm_datadist_role = LLMRole.PROMPT

        os.environ[env_key] = worker_visible_devices
        try:
            agent_metadata = LLMDataDistCMgrConnectorWorker.read_agent_metadata(
                worker, rank_table)
        finally:
            if old_visible_devices is None:
                # The variable did not exist before: remove it again instead
                # of leaving an empty string behind.
                os.environ.pop(env_key, None)
            else:
                os.environ[env_key] = old_visible_devices
        return agent_metadata.device_ip

    # One visible device per worker on each server.
    assert get_device_ip("192.168.1.1", 0, "0") == "10.30.0.1"
    assert get_device_ip("192.168.1.1", 0, "1") == "10.30.0.2"
    assert get_device_ip("192.168.1.2", 0, "0") == "10.30.0.3"
    assert get_device_ip("192.168.1.2", 0, "1") == "10.30.0.4"
    # Two visible devices: each TP rank is expected to get its own device.
    assert get_device_ip("192.168.1.1", 0, "0,1") == "10.30.0.1"
    assert get_device_ip("192.168.1.1", 1, "0,1") == "10.30.0.2"
    # Empty visible-devices value: expected to match the two-device case.
    assert get_device_ip("192.168.1.1", 0, "") == "10.30.0.1"
    assert get_device_ip("192.168.1.1", 1, "") == "10.30.0.2"
Disaggregate prefill for kv cache register style (#950) ### What this PR does / why we need it? This PR adopts `LLMDataDist` for kv cache register and `pull_blocks` style disaggregate prefill implementation. The interface implementation mainly follows the design of NIXL PR https://github.com/vllm-project/vllm/pull/17751/files#diff-7eaad0b7dee0626bf29d10081b0f0c5e3ea15a4af97e7b182a4e0d35f8346953 . This PR can be tested with the following steps: - Generate the rank table for all machines. - Execute `toy_proxy.py` to launch the disaggregate prefill proxy server, specifying the prefill ip, port and the decode ip, port. - Run the prefill server and decode server. - Send the request to the disaggregate prefill proxy. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/8d0a01a5f2b53794e4bc6b734d7b63cb8a9b7d7d --------- Signed-off-by: ganyi <pleaplusone.gy@gmail.com> Signed-off-by: machenglong <machenglong_yewu@cmss.chinamobile.com> Signed-off-by: liziyu179 <3475441767@qq.com> Signed-off-by: underfitc <hucong24@huawei.com> Signed-off-by: zouyida2052 <zouyida@huawei.com> Signed-off-by: liziyu <liziyu16@huawei.com> Signed-off-by: underfituu <hzhucong@163.com> Co-authored-by: machenglong <machenglong_yewu@cmss.chinamobile.com> Co-authored-by: liziyu179 <3475441767@qq.com> Co-authored-by: underfitc <hucong24@huawei.com> Co-authored-by: zouyida2052 <zouyida@huawei.com> Co-authored-by: liziyu <liziyu16@huawei.com> Co-authored-by: underfituu <hzhucong@163.com> 2025-07-26 17:15:47 +08:00			`# SPDX-License-Identifier: Apache-2.0`
			`# SPDX-FileCopyrightText: Copyright contributors to the vLLM project`
			`# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.`

[Bugfix][PD] Make multiple Ps and Ds work on a single machine (#2080) (cherry picked from commit 816375e0c1071d0696dfab1a1ce35674f9f37aa0) ### What this PR does / why we need it? Suppose that you want to start a prefiller instance with npus `2,3` only. So you start the instance with `ASCEND_RT_VISIBLE_DEVICES=2,3`. The current implementation will start two workers, whose ranks are `0` and `1` respectively. They will pick the first and second ip addresses of npus in the ranktable instead of the third and fourth ones. But they are actually using cards `2,3`, and therefore they cannot link with remote instances when they attempt to transfer the KVCache. Hence, at most 1 prefiller instance and at most 1 decoder instance can work on a single machine, since they currently always pick the first npu ip address in the ranktable. This pull request is proposed to fix the problem. This fix picks the ips of only those devices that are in `ASCEND_RT_VISIBLE_DEVICES` from the ranktable. ### Does this PR introduce _any_ user-facing change? If the user uses a ranktable generated by `gen_ranktable.sh`, they should not face any change. ### How was this patch tested? Qwen-0.6B 1P 1D, dp=2, `ASCEND_RT_VISIBLE_DEVICES=2,3` for prefiller and `ASCEND_RT_VISIBLE_DEVICES=4,5` for decoder. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad57f23f6a528ab01066998b41796a44340fd43d Signed-off-by: CaveNightingale <cavenightingale@foxmail.com> 2025-08-04 17:22:18 +08:00			`import os`
			`import types`

Disaggregate prefill for kv cache register style (#950) ### What this PR does / why we need it? This PR adopts `LLMDataDist` for kv cache register and `pull_blocks` style disaggregate prefill implementation. The interface implementation mainly follows the design of NIXL PR https://github.com/vllm-project/vllm/pull/17751/files#diff-7eaad0b7dee0626bf29d10081b0f0c5e3ea15a4af97e7b182a4e0d35f8346953 . This PR can be tested with the following steps: - Generate the rank table for all machines. - Execute `toy_proxy.py` to launch the disaggregate prefill proxy server, specifying the prefill ip, port and the decode ip, port. - Run the prefill server and decode server. - Send the request to the disaggregate prefill proxy. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/8d0a01a5f2b53794e4bc6b734d7b63cb8a9b7d7d --------- Signed-off-by: ganyi <pleaplusone.gy@gmail.com> Signed-off-by: machenglong <machenglong_yewu@cmss.chinamobile.com> Signed-off-by: liziyu179 <3475441767@qq.com> Signed-off-by: underfitc <hucong24@huawei.com> Signed-off-by: zouyida2052 <zouyida@huawei.com> Signed-off-by: liziyu <liziyu16@huawei.com> Signed-off-by: underfituu <hzhucong@163.com> Co-authored-by: machenglong <machenglong_yewu@cmss.chinamobile.com> Co-authored-by: liziyu179 <3475441767@qq.com> Co-authored-by: underfitc <hucong24@huawei.com> Co-authored-by: zouyida2052 <zouyida@huawei.com> Co-authored-by: liziyu <liziyu16@huawei.com> Co-authored-by: underfituu <hzhucong@163.com> 2025-07-26 17:15:47 +08:00			`from tests.ut.kv_connector.utils import (create_request, create_scheduler,`
			`create_vllm_config)`
[Bugfix][PD] Make multiple Ps and Ds work on a single machine (#2080) (cherry picked from commit 816375e0c1071d0696dfab1a1ce35674f9f37aa0) ### What this PR does / why we need it? Suppose that you want to start a prefiller instance with npus `2,3` only. So you start the instance with `ASCEND_RT_VISIBLE_DEVICES=2,3`. The current implementation will start two workers, whose ranks are `0` and `1` respectively. They will pick the first and second ip addresses of npus in the ranktable instead of the third and fourth ones. But they are actually using cards `2,3`, and therefore they cannot link with remote instances when they attempt to transfer the KVCache. Hence, at most 1 prefiller instance and at most 1 decoder instance can work on a single machine, since they currently always pick the first npu ip address in the ranktable. This pull request is proposed to fix the problem. This fix picks the ips of only those devices that are in `ASCEND_RT_VISIBLE_DEVICES` from the ranktable. ### Does this PR introduce _any_ user-facing change? If the user uses a ranktable generated by `gen_ranktable.sh`, they should not face any change. ### How was this patch tested? Qwen-0.6B 1P 1D, dp=2, `ASCEND_RT_VISIBLE_DEVICES=2,3` for prefiller and `ASCEND_RT_VISIBLE_DEVICES=4,5` for decoder. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad57f23f6a528ab01066998b41796a44340fd43d Signed-off-by: CaveNightingale <cavenightingale@foxmail.com> 2025-08-04 17:22:18 +08:00			`from vllm_ascend.distributed.llmdatadist_c_mgr_connector import (`
			`LLMDataDistCMgrConnectorMetadata, LLMDataDistCMgrConnectorWorker, LLMRole)`
Disaggregate prefill for kv cache register style (#950) ### What this PR does / why we need it? This PR adopts `LLMDataDist` for kv cache register and `pull_blocks` style disaggregate prefill implementation. The interface implementation mainly follows the design of NIXL PR https://github.com/vllm-project/vllm/pull/17751/files#diff-7eaad0b7dee0626bf29d10081b0f0c5e3ea15a4af97e7b182a4e0d35f8346953 . This PR can be tested with the following steps: - Generate the rank table for all machines. - Execute `toy_proxy.py` to launch the disaggregate prefill proxy server, specifying the prefill ip, port and the decode ip, port. - Run the prefill server and decode server. - Send the request to the disaggregate prefill proxy. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/8d0a01a5f2b53794e4bc6b734d7b63cb8a9b7d7d --------- Signed-off-by: ganyi <pleaplusone.gy@gmail.com> Signed-off-by: machenglong <machenglong_yewu@cmss.chinamobile.com> Signed-off-by: liziyu179 <3475441767@qq.com> Signed-off-by: underfitc <hucong24@huawei.com> Signed-off-by: zouyida2052 <zouyida@huawei.com> Signed-off-by: liziyu <liziyu16@huawei.com> Signed-off-by: underfituu <hzhucong@163.com> Co-authored-by: machenglong <machenglong_yewu@cmss.chinamobile.com> Co-authored-by: liziyu179 <3475441767@qq.com> Co-authored-by: underfitc <hucong24@huawei.com> Co-authored-by: zouyida2052 <zouyida@huawei.com> Co-authored-by: liziyu <liziyu16@huawei.com> Co-authored-by: underfituu <hzhucong@163.com> 2025-07-26 17:15:47 +08:00

			`def test_basic_inferface():`
			`"""Unit test for basic LLMDataDistCMgrConnector interface functionality."""`

			`vllm_config = create_vllm_config()`
			`scheduler = create_scheduler(vllm_config)`

			`# 2 Full Blocks and 1 Half Block.`
			`BLOCK_SIZE = vllm_config.cache_config.block_size`
			`NUM_EXTERNAL_FULL_BLOCKS = 2`
			`NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5))`

			`request = create_request(request_id=1,`
			`num_tokens=NUM_TOKENS,`
			`do_remote_prefill=True)`
			`request_id = request.request_id`

			`scheduler.add_request(request)`

			`# Remote Prefill, triggers LLMDataDistCMgrConnectorMetadata.`
			`scheduler_output = scheduler.schedule()`
			`kv_connector_metadata = scheduler_output.kv_connector_metadata`
			`assert kv_connector_metadata is not None`
			`assert isinstance(kv_connector_metadata, LLMDataDistCMgrConnectorMetadata)`

			`assert len(kv_connector_metadata.requests) == 1`
			`assert request_id in kv_connector_metadata.requests`
			`req_meta = kv_connector_metadata.requests[request_id]`

			`for block_id, block in zip(`
			`req_meta.local_block_ids, scheduler.kv_cache_manager.coordinator.`
			`single_type_managers[0].req_to_blocks[request_id]):`
			`assert block_id == block.block_id`
[Bugfix][PD] Make multiple Ps and Ds work on a single machine (#2080) (cherry picked from commit 816375e0c1071d0696dfab1a1ce35674f9f37aa0) ### What this PR does / why we need it? Suppose that you want to start a prefiller instance with npus `2,3` only. So you start the instance with `ASCEND_RT_VISIBLE_DEVICES=2,3`. The current implementation will start two workers, whose ranks are `0` and `1` respectively. They will pick the first and second ip addresses of npus in the ranktable instead of the third and fourth ones. But they are actually using cards `2,3`, and therefore they cannot link with remote instances when they attempt to transfer the KVCache. Hence, at most 1 prefiller instance and at most 1 decoder instance can work on a single machine, since they currently always pick the first npu ip address in the ranktable. This pull request is proposed to fix the problem. This fix picks the ips of only those devices that are in `ASCEND_RT_VISIBLE_DEVICES` from the ranktable. ### Does this PR introduce _any_ user-facing change? If the user uses a ranktable generated by `gen_ranktable.sh`, they should not face any change. ### How was this patch tested? Qwen-0.6B 1P 1D, dp=2, `ASCEND_RT_VISIBLE_DEVICES=2,3` for prefiller and `ASCEND_RT_VISIBLE_DEVICES=4,5` for decoder. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad57f23f6a528ab01066998b41796a44340fd43d Signed-off-by: CaveNightingale <cavenightingale@foxmail.com> 2025-08-04 17:22:18 +08:00

			`def test_read_agent_metadata():`
			`rank_table = {`
			`"version":`
			`"1.2",`
			`"server_count":`
			`"2",`
			`"prefill_device_list": [{`
			`"server_id": "192.168.1.1",`
			`"device_id": "0",`
			`"device_ip": "10.30.0.1",`
			`"cluster_id": "0",`
			`}, {`
			`"server_id": "192.168.1.1",`
			`"device_id": "1",`
			`"device_ip": "10.30.0.2",`
			`"cluster_id": "1",`
			`}, {`
			`"server_id": "192.168.1.2",`
			`"device_id": "0",`
			`"device_ip": "10.30.0.3",`
			`"cluster_id": "2",`
			`}, {`
			`"server_id": "192.168.1.2",`
			`"device_id": "1",`
			`"device_ip": "10.30.0.4",`
			`"cluster_id": "3",`
			`}]`
			`}`

			`def get_device_ip(worker_local_ip, worker_tp_rank, worker_visible_devices):`
			`old_visible_devices = os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "")`
			`worker = types.SimpleNamespace()`
			`worker.local_ip = worker_local_ip`
			`worker.tp_rank = worker_tp_rank`
			`worker.llm_datadist_role = LLMRole.PROMPT`
			`os.environ["ASCEND_RT_VISIBLE_DEVICES"] = worker_visible_devices`
			`agent_metadata = LLMDataDistCMgrConnectorWorker.read_agent_metadata(`
			`worker, rank_table)`
			`os.environ["ASCEND_RT_VISIBLE_DEVICES"] = old_visible_devices`
			`return agent_metadata.device_ip`

			`assert get_device_ip("192.168.1.1", 0, "0") == "10.30.0.1"`
			`assert get_device_ip("192.168.1.1", 0, "1") == "10.30.0.2"`
			`assert get_device_ip("192.168.1.2", 0, "0") == "10.30.0.3"`
			`assert get_device_ip("192.168.1.2", 0, "1") == "10.30.0.4"`
			`assert get_device_ip("192.168.1.1", 0, "0,1") == "10.30.0.1"`
			`assert get_device_ip("192.168.1.1", 1, "0,1") == "10.30.0.2"`
			`assert get_device_ip("192.168.1.1", 0, "") == "10.30.0.1"`
			`assert get_device_ip("192.168.1.1", 1, "") == "10.30.0.2"`