[Disaggregated Prefill] P2P Disaggregated Prefill based on llm_datadist (#694)

### What this PR does / why we need it? - This PR proposes a P2P version of Disaggregated Prefill based on llm_datadist which manages data transfer. - This solution reconstructs the previous offline single-node Disaggregated Prefill solution, and supports multi-node and online serving now. - Currently this solution supports the 1P1D situation of Deepseek hybrid parallelism (P: TP+EP, D: DP+EP). Note that the xPyD situation is considered in the solution design, and will be supported soon within v1 engine. --------- Signed-off-by: hw_whx <wanghexiang7@huawei.com> Signed-off-by: ganyi <pleaplusone.gy@gmail.com> Co-authored-by: hw_whx <wanghexiang7@huawei.com> Co-authored-by: ganyi <pleaplusone.gy@gmail.com>
2025-05-01 22:31:36 +08:00
parent 84e2ed898b
commit 8b194ad12e
18 changed files with 1769 additions and 32 deletions
--- a/examples/disaggregated_prefill/disaggregated_prefill_offline.py
+++ b/examples/disaggregated_prefill/disaggregated_prefill_offline.py
@@ -0,0 +1,138 @@
+"""
+ This file demonstrates the example usage of disaggregated prefilling
+ We will launch 2 vllm instances (NPU 0,1 for prefill and NPU 2,3 for decode),
+ and then transfer the KV cache between them.
+ prompt_device_ips denotes device ip of NPU 0,1
+ decode_device_ips denotes device ip of NPU 2,3
+ The device ips of all NPUs in current server can be found through
+ examples/disaggregated_prefill/find_device_ips.py
+ """
+import multiprocessing as mp
+import os
+import time
+from multiprocessing import Event, Process
+
+kv_connector_extra_config = {
+    "prompt_device_ips": ["1.2.3.1", "1.2.3.2"],
+    "decode_device_ips": ["1.2.3.9", "1.2.3.10"],
+    "llmdatadist_comm_port": 26000,
+}
+
+
+def clean_up():
+    """Destroy vLLM's model-parallel and distributed state, run GC, and empty torch's NPU cache."""
+    import gc
+    import torch
+    from vllm.distributed.parallel_state import (
+        destroy_distributed_environment, destroy_model_parallel)
+    destroy_model_parallel()
+    destroy_distributed_environment()
+    gc.collect()
+    torch.npu.empty_cache()
+
+
+def run_prefill(prefill_done, process_close):
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
+
+    from vllm import LLM, SamplingParams
+    from vllm.config import KVTransferConfig
+
+    prompts = [
+        "Hello, how are you today?", "Hi, what is your name?",
+        "Tell me a very long story.", "what is your favourite book?"
+    ]
+    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
+
+    ktc = KVTransferConfig.from_cli(
+        '{"kv_connector":"AscendSimpleConnector","kv_buffer_device":"npu","kv_role":"kv_producer", "kv_parallel_size":2}'
+    )
+    global kv_connector_extra_config
+    ktc.kv_connector_extra_config = kv_connector_extra_config
+    llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+              kv_transfer_config=ktc,
+              max_model_len=2000,
+              gpu_memory_utilization=0.8,
+              tensor_parallel_size=2)
+
+    llm.generate(prompts, sampling_params)
+    print("Prefill node is finished.")
+    prefill_done.set()
+
+    # To keep the prefill node running in case the decode node is not done;
+    # otherwise, the script might exit prematurely, causing incomplete decoding.
+    try:
+        while not process_close.is_set():
+            time.sleep(1)
+    except KeyboardInterrupt:
+        print("Script stopped by user.")
+    finally:
+        print("Cleanup prefill resources")
+        del llm
+        clean_up()
+
+
+def run_decode(prefill_done):
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "2,3"
+
+    from vllm import LLM, SamplingParams
+    from vllm.config import KVTransferConfig
+
+    prompts = [
+        "Hello, how are you today?",
+        "Hi, what is your name?",
+    ]
+    sampling_params = SamplingParams(temperature=0, top_p=0.95)
+
+    ktc = KVTransferConfig.from_cli(
+        '{"kv_connector":"AscendSimpleConnector","kv_buffer_device":"npu","kv_role":"kv_consumer","kv_parallel_size":2}'
+    )
+    global kv_connector_extra_config
+    ktc.kv_connector_extra_config = kv_connector_extra_config
+    llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+              kv_transfer_config=ktc,
+              max_model_len=2000,
+              gpu_memory_utilization=0.8,
+              tensor_parallel_size=2)
+
+    # Wait for the producer to start the consumer
+    print("Waiting for prefill node to finish...")
+    prefill_done.wait()
+
+    # At this point when the prefill_done is set, the kv-cache should have been
+    # transferred to this decode node, so we can start decoding.
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    del llm
+    clean_up()
+
+
+if __name__ == "__main__":
+    # mp.get_context('spawn') only returns a context object and leaves the
+    # default start method unchanged; set it so Event/Process really use spawn.
+    mp.set_start_method('spawn')
+
+    prefill_done = Event()
+    process_close = Event()
+    prefill_process = Process(target=run_prefill,
+                              args=(prefill_done, process_close))
+    decode_process = Process(target=run_decode, args=(prefill_done, ))
+
+    # Start prefill node
+    prefill_process.start()
+
+    # Start decode node
+    decode_process.start()
+
+    # Wait for the decode node to finish generating
+    decode_process.join()
+
+    # Tell the prefill process to clean up and exit, then wait for it.
+    # terminate() after join() is only a no-op safety net.
+    process_close.set()
+    prefill_process.join()
+    prefill_process.terminate()
+    print("All process done!")
--- a/examples/disaggregated_prefill/dp_proxy.py
+++ b/examples/disaggregated_prefill/dp_proxy.py
@@ -0,0 +1,463 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import asyncio
+import copy
+import logging
+import os
+import threading
+import time
+import uuid
+
+import aiohttp
+import msgpack  # type: ignore
+import zmq
+from quart import Quart, make_response, request
+
+DP_PROXY_HTTP_PORT = 10004
+DP_PROXY_ZMQ_REG_PORT = 30006
+DP_PROXY_ZMQ_NOTIFY_PORT = 30005
+
+PD_PROXY_ADDRESS = "127.0.0.1:30002"
+
+MY_HTTP_ADDRESS = f"127.0.0.1:{DP_PROXY_HTTP_PORT}"
+MY_ZMQ_ADDRESS_PLACEHOLDER = f"127.0.0.1:{DP_PROXY_ZMQ_REG_PORT}"
+
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+TIME_INTERVAL_FOR_IDLE_RUN = 5e-4
+DP_SIZE = 2
+
+dp_instances: dict[str, bool] = {}
+dp_cv = threading.Condition()
+round_robin_index = 0
+_idle_send_loop = None
+
+
+def make_idle_request():
+    """Build the minimal completion payload that is sent as an idle request."""
+    # A one-token request: max_tokens is 1 and temperature is 0.
+    return {
+        "prompt": "hi",
+        "max_tokens": 1,
+        "temperature": 0,
+    }
+
+
+def random_uuid() -> str:
+    return uuid.uuid4().hex  # .hex is already a str of 32 hex digits
+
+
+async def send_idle_token_to_client(schedule_dict):
+    for key, value in schedule_dict.items():
+        if value:
+            continue
+        request_received_id = random_uuid()
+        idle_request_data = make_idle_request()
+        forward_request_id = f"dp_idle_{key}_{request_received_id}"
+        target_url = f'http://{key}/v1/completions'
+        logger.debug(
+            f"DP Decode Proxy: Sending idle token to D node {key} at {target_url}"
+        )
+        generator = forward_request_internal(target_url, idle_request_data,
+                                             forward_request_id)
+        try:
+            async for response in generator:
+                logger.debug(
+                    f"DP Decode Proxy: Idle Request {request_received_id}: response from {key}, got response: {response}"
+                )
+        except Exception as e:
+            logger.warning(
+                f"DP Decode Proxy: Error sending idle token to {key}: {e}")
+
+
+def metadata_collect_trigger(poller, router_socket):
+    """Track per-step notifications from D nodes and trigger idle requests.
+
+    Waits until DP_SIZE D nodes are registered, then loops forever: each round
+    collects ``notify_step`` messages until every registered node has reported,
+    scheduling idle requests on ``_idle_send_loop`` for the nodes that lag.
+
+    Args:
+        poller: zmq poller that ``router_socket`` is registered with.
+        router_socket: zmq socket delivering msgpack dicts shaped like
+            ``{"info": "notify_step", "http_address": "ip:port"}``.
+    """
+    with dp_cv:
+        # Predicate form: a notify_all() issued before we wait is not lost.
+        dp_cv.wait_for(lambda: len(dp_instances) >= DP_SIZE)
+    while True:
+        try:
+            schedule_dict = dict.fromkeys(list(dp_instances), False)
+            start_time = None  # set once the first node reports in this round
+            while not all(schedule_dict.values()):
+                if start_time is not None:
+                    time_interval = time.time() - start_time
+                    logger.debug("check time interval: %s", time_interval)
+                    if time_interval > TIME_INTERVAL_FOR_IDLE_RUN:
+                        logger.debug("exceeds max time interval send idle token to client")
+                        # Send idle token to client in case a single dp rank runs solo and blocks on the CCL part
+                        asyncio.run_coroutine_threadsafe(
+                            send_idle_token_to_client(schedule_dict),
+                            _idle_send_loop)  # type: ignore
+                        # Restart the timer so idle tokens are not re-sent on every pass. A client may
+                        # miss one, so we keep retrying until all clients have reported for this step.
+                        start_time = time.time()
+                socks = dict(poller.poll(timeout=500))  # timeout in 500ms
+                if socks:
+                    logger.debug("receive socks from moniter threads: %s", socks)
+                if router_socket not in socks:
+                    continue
+                message = None  # bound up front so the except clause can always log it
+                try:
+                    # {"info": "notify_step", "http_address": ""}
+                    for message in router_socket.recv_multipart():
+                        data = msgpack.loads(message)
+                        logger.debug(f"receive message {data}")
+                        if data.get("info") != "notify_step":
+                            logger.warning(
+                                "Got unrecognize info type! We only accept notify step info yet"
+                            )
+                            continue
+                        http_addr = data.get("http_address")
+                        if http_addr not in schedule_dict:
+                            logger.warning("Unrecognize http address")
+                            continue
+                        schedule_dict[http_addr] = True
+                        logger.debug("set first time")
+                        if start_time is None:
+                            logger.debug("record start time")
+                            start_time = time.time()
+                except (msgpack.UnpackException, TypeError, KeyError, AttributeError) as e:
+                    logger.error(
+                        f"Error processing message: {e}. Message: {message!r}")
+        except zmq.ZMQError as e:  # type: ignore
+            logger.error(f"ZMQ Error in monitor thread: {e}")
+            if e.errno == zmq.ETERM:  # type: ignore
+                logger.error(
+                    "Monitor thread terminating due to context termination.")
+                break
+            time.sleep(1)
+        except Exception as e:
+            logger.error(f"Unexpected error in monitor thread: {e}")
+            import traceback
+            traceback.print_exc()
+            time.sleep(1)
+
+
+def _listen_for_d_register(poller, router_socket):
+    global dp_instances
+    global dp_cv
+    global DP_SIZE
+    logger.info(
+        f"DP Decode Proxy: D Node ZMQ Listener started on ROUTER port {DP_PROXY_ZMQ_REG_PORT}"
+    )
+
+    while True:
+        try:
+            socks = dict(poller.poll(timeout=1000))
+            if router_socket in socks:
+                remote_id, message = router_socket.recv_multipart()
+                try:
+                    data = msgpack.loads(message)
+                    if data.get("type") == "DP":
+                        http_addr = data.get("http_address")
+                        zmq_addr = data.get("zmq_address")
+                        if http_addr:
+                            with dp_cv:
+                                if http_addr not in dp_instances:
+                                    logger.info(
+                                        f"DP Decode Proxy: Registering D Node instance: http={http_addr}, zmq={zmq_addr}"
+                                    )
+                                    dp_instances[http_addr] = True
+                                    if len(dp_instances) >= DP_SIZE:
+                                        logger.info(
+                                            f"DP Decode Proxy: Reached expected D Node count ({DP_SIZE}). Notifying metadata collector."
+                                        )
+                                        dp_cv.notify_all()
+                                else:
+                                    pass
+                        else:
+                            logger.warning(
+                                f"DP Decode Proxy: Received D Node registration from {remote_id.decode()} without http_address. Data: {data}"
+                            )
+                    else:
+                        logger.warning(
+                            f"DP Decode Proxy: Received message with unexpected type from {remote_id.decode()}. Type: {data.get('type')}, Data: {data}"
+                        )
+
+                except (msgpack.UnpackException, TypeError, KeyError) as e:
+                    logger.error(
+                        f"DP Decode Proxy: Error processing D Node registration from {remote_id.decode()}: {e}. Message: {message}"
+                    )
+                except Exception as e:
+                    logger.error(
+                        f"DP Decode Proxy: Unexpected error processing D Node registration from {remote_id.decode()}: {e}"
+                    )
+
+        except zmq.ZMQError as e:  # type: ignore
+            logger.error(
+                f"DP Decode Proxy: ZMQ Error in D Node listener thread: {e}")
+            if e.errno == zmq.ETERM:  # type: ignore
+                logger.info(
+                    "DP Decode Proxy: D Node Listener thread terminating.")
+                break
+            time.sleep(1)
+        except Exception as e:
+            logger.error(
+                f"DP Decode Proxy: Unexpected error in D Node listener thread: {e}"
+            )
+            import traceback
+            traceback.print_exc()
+            time.sleep(1)
+
+
+def _register_to_pd_proxy(pd_proxy_zmq_addr, my_http_addr, my_zmq_addr):
+    context = None
+    sock = None
+    while True:
+        try:
+            if context is None:
+                context = zmq.Context()  # type: ignore
+            if sock is None:
+                sock = context.socket(zmq.DEALER)  # type: ignore
+                identity = f"dp_proxy_{my_http_addr}".encode('utf-8')
+                sock.setsockopt(zmq.IDENTITY, identity)  # type: ignore
+                sock.setsockopt(zmq.LINGER, 0)  # type: ignore
+                logger.info(
+                    f"DP Decode Proxy: Attempting to connect to PD Proxy at {pd_proxy_zmq_addr}..."
+                )
+                sock.connect(f"tcp://{pd_proxy_zmq_addr}")
+                logger.info(
+                    f"DP Decode Proxy: Connected to PD Proxy at {pd_proxy_zmq_addr}."
+                )
+
+            data = {
+                "type": "D",
+                "http_address": my_http_addr,
+                "zmq_address": my_zmq_addr
+            }
+            logger.debug(
+                f"DP Decode Proxy: Sending registration/heartbeat to PD Proxy: {data}"
+            )
+            sock.send(msgpack.dumps(data))
+            time.sleep(5)
+
+        except zmq.ZMQError as e:  # type: ignore
+            logger.error(
+                f"DP Decode Proxy: ZMQ Error connecting/sending to PD Proxy ({pd_proxy_zmq_addr}): {e}"
+            )
+            if sock:
+                sock.close()
+                sock = None
+            time.sleep(10)
+        except Exception as e:
+            logger.error(
+                f"DP Decode Proxy: Unexpected error in PD Proxy registration thread: {e}"
+            )
+            import traceback
+            traceback.print_exc()
+            if sock:
+                sock.close()
+                sock = None
+            time.sleep(10)
+        finally:
+            pass
+
+
+def start_zmq_thread(hostname, port, socket_type, target_func, thread_name):
+    """Generic ZMQ thread starter for ROUTER or PULL."""
+    if not hostname:
+        hostname = "0.0.0.0"
+    context = zmq.Context.instance()  # type: ignore
+    socket = context.socket(socket_type)
+    socket.setsockopt(zmq.LINGER, 0)  # type: ignore
+    try:
+        socket.bind(f"tcp://{hostname}:{port}")
+    except zmq.ZMQError as e:  # type: ignore
+        logger.error(
+            f"DP Decode Proxy: Error binding ZMQ {socket_type} socket to tcp://{hostname}:{port}: {e}"
+        )
+        socket.close()
+        raise
+
+    poller = zmq.Poller()  # type: ignore
+    poller.register(socket, zmq.POLLIN)  # type: ignore
+
+    thread = threading.Thread(target=target_func,
+                              args=(poller, socket),
+                              daemon=True,
+                              name=thread_name)
+    thread.start()
+    return thread, socket
+
+
+def start_thread_with_event_loop():
+    global _idle_send_loop  # assigned in the __main__ block before this thread is started
+    asyncio.set_event_loop(_idle_send_loop)  # make it this thread's current loop
+    _idle_send_loop.run_forever()  # type: ignore
+
+
+async def forward_request_internal(url, data, request_id):
+    try:
+        async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+            headers = {
+                "Authorization":
+                f"Bearer {os.environ.get('OPENAI_API_KEY', '')}",
+                "X-Request-Id": request_id,
+                "Content-Type": "application/json"
+            }
+            async with session.post(url=url, json=data,
+                                    headers=headers) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content.iter_chunked(
+                            1024):
+                        yield chunk_bytes
+                else:
+                    error_content = await response.read()
+                    logger.warning(
+                        f"DP Decode Proxy: Error from D node {url} (status {response.status}): {error_content.decode(errors='ignore')}"
+                    )
+                    yield error_content
+
+    except aiohttp.ClientError as e:
+        logger.warning(
+            f"DP Decode Proxy: Error forwarding request {request_id} to D node {url}: {e}"
+        )
+        error_msg = f"Failed to connect or communicate with D node at {url}: {e}".encode(
+            'utf-8')
+        yield error_msg
+
+
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+app = Quart(__name__)
+
+
+@app.route('/v1/completions', methods=['POST'])
+async def handle_request():
+    """Forward a completion request to one registered D node (round-robin).
+
+    The ``X-Request-Id`` header is propagated to the D node; a fallback id is
+    generated when it is missing. The D node's response is streamed back.
+
+    Returns:
+        The streamed D-node response; 400 when the parsed JSON body is falsy,
+        503 when no D node is registered, 500 on any other exception.
+    """
+    global round_robin_index
+
+    request_received_id = request.headers.get("X-Request-Id")
+    if not request_received_id:
+        fallback_id = f"dp_fallback_{random_uuid()}"
+        logger.warning(
+            f"DP Decode Proxy: Received request without X-Request-Id header. Using fallback ID: {fallback_id}"
+        )
+        request_received_id = fallback_id
+    else:
+        logger.info(
+            f"DP Decode Proxy: Received request from PD Proxy, using propagated ID: {request_received_id}"
+        )
+
+    try:
+        original_request_data = await request.get_json()
+        if not original_request_data:
+            return await make_response("Request body must be valid JSON.", 400)
+
+        # Choose the target while holding dp_cv, but do not await inside the
+        # ``with``: dp_cv is a threading primitive shared with the ZMQ threads.
+        target_addr = None
+        with dp_cv:
+            dp_addresses = list(dp_instances)
+            if dp_addresses:
+                current_selection_index = round_robin_index % len(dp_addresses)
+                target_addr = dp_addresses[current_selection_index]
+                round_robin_index += 1
+        if target_addr is None:
+            logger.warning(
+                f"DP Decode Proxy: Request {request_received_id}: No D Node instances available/registered."
+            )
+            return await make_response("No Decode instances available.", 503)
+
+        logger.info(
+            f"DP Decode Proxy: Request {request_received_id}: Routing Decode to D Node {target_addr} (Index {current_selection_index})"
+        )
+
+        target_url = f'http://{target_addr}/v1/completions'
+        generator = forward_request_internal(target_url, original_request_data,
+                                             request_received_id)
+        response = await make_response(generator)
+        response.timeout = None
+
+        if original_request_data.get("stream", False):
+            response.headers['Content-Type'] = 'text/event-stream'
+            response.headers['Cache-Control'] = 'no-cache'
+        else:
+            response.headers['Content-Type'] = 'application/json'
+
+        logger.debug(
+            f"DP Decode Proxy: Request {request_received_id}: Streaming response from D node {target_addr}"
+        )
+        return response
+
+    except Exception as e:
+        logger.error(
+            f"DP Decode Proxy: Error handling request {request_received_id}: {e}"
+        )
+        return await make_response("Internal Server Error", 500)
+
+
+if __name__ == '__main__':
+    d_listener_thread, d_reg_socket = start_zmq_thread(
+        "0.0.0.0",
+        DP_PROXY_ZMQ_REG_PORT,
+        zmq.ROUTER,  # type: ignore
+        _listen_for_d_register,  # type: ignore
+        "DP_DNodeListenerThread")
+
+    metadata_thread, notify_socket = start_zmq_thread(
+        "0.0.0.0",
+        DP_PROXY_ZMQ_NOTIFY_PORT,
+        zmq.PULL,  # type: ignore
+        metadata_collect_trigger,
+        "DP_MetadataMonitorThread")
+
+    _idle_send_loop = asyncio.new_event_loop()
+    idle_loop_thread = threading.Thread(target=start_thread_with_event_loop,
+                                        daemon=True,
+                                        name="DP_IdleSendLoopThread")
+    idle_loop_thread.start()
+
+    pd_register_thread = threading.Thread(target=_register_to_pd_proxy,
+                                          args=(PD_PROXY_ADDRESS,
+                                                MY_HTTP_ADDRESS,
+                                                MY_ZMQ_ADDRESS_PLACEHOLDER),
+                                          daemon=True,
+                                          name="DP_PDRegisterThread")
+    pd_register_thread.start()
+
+    logger.info(
+        f"DP Decode Proxy: Starting Quart web server on http://0.0.0.0:{DP_PROXY_HTTP_PORT}"
+    )
+    zmq_context = zmq.Context.instance()  # type: ignore
+    try:
+        app.run(host='0.0.0.0', port=DP_PROXY_HTTP_PORT)
+    except KeyboardInterrupt:
+        logger.info("DP Decode Proxy: KeyboardInterrupt received, stopping...")
+    except Exception as e:
+        logger.error(f"DP Decode Proxy: Failed to run Quart server: {e}")
+    finally:
+        logger.info("DP Decode Proxy: Shutting down...")
+        if _idle_send_loop and _idle_send_loop.is_running():
+            logger.info("DP Decode Proxy: Stopping idle send loop...")
+            _idle_send_loop.call_soon_threadsafe(_idle_send_loop.stop)
+
+        if d_reg_socket:
+            d_reg_socket.close()
+        if notify_socket:
+            notify_socket.close()
+        if zmq_context:
+            zmq_context.term()
+
+        logger.info("DP Decode Proxy: Shutdown complete.")
--- a/examples/disaggregated_prefill/find_device_ips.py
+++ b/examples/disaggregated_prefill/find_device_ips.py
@@ -0,0 +1,67 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm-project/vllm/examples/offline_inference/basic.py
+#
+"""
+ This file provides a function to obtain ips of all NPU Devices in current machine.
+"""
+
+import os
+import re
+import subprocess
+
+import vllm_ascend.envs as envs
+
+# Get all device ips using hccn_tool
+HCCN_TOOL_PATH = envs.HCCN_PATH
+
+
+def get_device_ips(world_size: int):
+    """Return the device IPs of ``world_size`` consecutive NPUs on this machine.
+
+    The first NPU index is parsed from ``npu-smi info -m`` and each IP from
+    ``hccn_tool -i <idx> -ip -g``.
+
+    Raises:
+        RuntimeError: if a tool is missing/fails or its output cannot be parsed.
+    """
+    run_opts = dict(stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                    universal_newlines=True)
+    npu_info = subprocess.run(["npu-smi", "info", "-m"], **run_opts)
+    if npu_info.returncode != 0 or not os.path.exists(HCCN_TOOL_PATH):
+        raise RuntimeError("No npu-smi/hccn_tool tools provided for NPU.")
+    # The regex expects the second output line to start with a tab followed by the index.
+    start_match = re.match(r".*\n\t([0-9]+).*", npu_info.stdout)
+    if start_match is None:
+        raise RuntimeError("Cannot parse NPU index from `npu-smi info -m`.")
+    npu_start_idx = int(start_match.group(1))
+    device_ip_list = []
+    for ip_offset in range(world_size):
+        npu_idx = npu_start_idx + ip_offset
+        cmd = [HCCN_TOOL_PATH, "-i", f"{npu_idx}", "-ip", "-g"]
+        device_ip_info = subprocess.run(cmd, **run_opts)
+        ip_match = re.match(r"ipaddr:(.*)\n", device_ip_info.stdout)
+        if ip_match is None:
+            raise RuntimeError(
+                f"Cannot parse device ip of NPU {npu_idx} from hccn_tool: "
+                f"{device_ip_info.stderr.strip() or device_ip_info.stdout!r}")
+        device_ip_list.append(ip_match.group(1))
+    return device_ip_list
+
+
+if __name__ == "__main__":  # do not probe devices when this module is merely imported
+    print(get_device_ips(8))  # Pass number of NPUs into this function.
--- a/examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py
+++ b/examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py
@@ -0,0 +1,186 @@
+import os
+import socket
+import threading
+import uuid
+
+import aiohttp
+import msgpack  # type: ignore
+import zmq
+from quart import Quart, make_response, request
+
+prefill_instances: dict[str, str] = {}  # http_address: zmq_address
+decode_instances: dict[str, str] = {}  # http_address: zmq_address
+
+prefill_cv = threading.Condition()
+decode_cv = threading.Condition()
+
+
+def _listen_for_register(poller, router_socket):
+    """Record P/D instances that register over ``router_socket``, forever.
+
+    Each message is a msgpack dict:
+    {"type": "P" | "D", "http_address": "ip:port", "zmq_address": "ip:port"}.
+    "P" entries go to ``prefill_instances``, "D" entries to
+    ``decode_instances``; both map http_address -> zmq_address.
+    """
+    # (registry, lock guarding it, human-readable role) per message type.
+    targets = {
+        "P": (prefill_instances, prefill_cv, "prefill"),
+        "D": (decode_instances, decode_cv, "decode"),
+    }
+    while True:
+        socks = dict(poller.poll())
+        if router_socket not in socks:
+            continue
+        remote_address, message = router_socket.recv_multipart()
+        try:
+            data = msgpack.loads(message)
+            target = targets.get(data["type"])
+            if target is None:
+                print(f"Unexpected, Received message from {remote_address}, "
+                      f"data: {data}")
+                continue
+            registry, lock, role = target
+            http_addr, zmq_addr = data["http_address"], data["zmq_address"]
+        except (msgpack.UnpackException, KeyError, TypeError, ValueError) as e:
+            # A malformed message must not kill the only registration thread.
+            print(f"Ignoring malformed register message from "
+                  f"{remote_address}: {e}")
+            continue
+        with lock:
+            registry[http_addr] = zmq_addr
+        print(f"Get a {role} register with http_addr {http_addr} "
+              f"and zmq_addr {zmq_addr}")
+
+
+def start_service_discovery(hostname, port):
+    if not hostname:
+        hostname = socket.gethostname()
+    if port == 0:
+        raise ValueError("Port cannot be 0")
+
+    context = zmq.Context()  # type: ignore
+    router_socket = context.socket(zmq.ROUTER)  # type: ignore
+    router_socket.bind(f"tcp://{hostname}:{port}")
+
+    poller = zmq.Poller()  # type: ignore
+    poller.register(router_socket, zmq.POLLIN)  # type: ignore
+
+    _listener_thread = threading.Thread(target=_listen_for_register,
+                                        args=[poller, router_socket],
+                                        daemon=True)
+    _listener_thread.start()
+    return _listener_thread
+
+
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+
+app = Quart(__name__)
+
+
+def random_uuid() -> str:
+    return uuid.uuid4().hex  # .hex is already a str of 32 hex digits
+
+
+async def forward_request(url, data, request_id):
+    """POST ``data`` to ``url``; yield a 200 response body in chunks of up to 1024 bytes."""
+    headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY', '')}",
+               "X-Request-Id": request_id}  # '' default avoids sending "Bearer None"
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        async with session.post(url=url, json=data, headers=headers) as response:
+            if response.status != 200:
+                print(f"Request {request_id} to {url} failed with status {response.status}")
+                return
+            async for chunk_bytes in response.content.iter_chunked(1024):
+                yield chunk_bytes
+
+
+def _pick_first_instance(instances, cv, role):
+    """Return the first registered (http_addr, zmq_addr) pair, or None if empty.
+
+    ``cv`` guards ``instances``; ``role`` ("Prefill"/"Decode") is only used in the
+    warning printed when more than one instance is registered.
+    """
+    with cv:
+        if not instances:
+            return None
+        first = next(iter(instances.items()))
+        if len(instances) > 1:
+            print(
+                f"Found more than 1 {role} instances. Currently we only support 1P1D, so only "
+                f"the first {role} instance({first[0]}) will be used!")
+        return first
+
+
+async def _no_instance_response(role):
+    """Print and return a 503 response saying that no ``role`` instance is registered."""
+    res_str = (
+        f"No {role} instances has been registered to proxy. Please confirm that you have successfully"
+        f" and correctly started a {role} vLLM instance.")
+    print(res_str)
+    return await make_response(res_str, 503)
+
+
+@app.route("/v1/completions", methods=["POST"])
+async def handle_request():
+    """Run prefill on the P instance, then stream decode from the D instance.
+
+    The request is first sent to the prefill instance with ``max_tokens`` forced
+    to 1 and its output is discarded; the unmodified request is then forwarded to
+    the decode instance under the same request id and streamed to the caller.
+
+    Returns:
+        The streamed decode response; a 503 response when no Prefill or Decode
+        instance is registered; a 500 response on any other failure.
+    """
+    try:
+        original_request_data = await request.get_json()
+
+        prefill_request = original_request_data.copy()
+        # change max_tokens = 1 to let it only do prefill
+        prefill_request["max_tokens"] = 1
+
+        prefill = _pick_first_instance(prefill_instances, prefill_cv, "Prefill")
+        if prefill is None:
+            return await _no_instance_response("Prefill")
+        prefill_addr, prefill_zmq_addr = prefill
+        print(f"handle_request, prefill_addr: {prefill_addr}, "
+              f"zmq_addr: {prefill_zmq_addr}")
+
+        decode = _pick_first_instance(decode_instances, decode_cv, "Decode")
+        if decode is None:
+            return await _no_instance_response("Decode")
+        decode_addr, decode_zmq_addr = decode
+        print(f"handle_request, decode_addr: {decode_addr}, "
+              f"zmq_addr: {decode_zmq_addr}")
+
+        request_id = f"___prefill_addr_{prefill_addr}___decode_addr_{decode_addr}_{random_uuid()}"
+
+        # finish prefill
+        async for _ in forward_request(f"http://{prefill_addr}/v1/completions",
+                                       prefill_request, request_id):
+            continue
+
+        # return decode
+        generator = forward_request(f"http://{decode_addr}/v1/completions",
+                                    original_request_data, request_id)
+        response = await make_response(generator)
+        response.timeout = None
+        return response
+
+    except Exception as e:
+        import sys
+        import traceback
+
+        exc_info = sys.exc_info()
+        print("Error occurred in disagg prefill proxy server")
+        print(e)
+        print("".join(traceback.format_exception(*exc_info)))
+        # A Quart view must not return None, so answer with an explicit 500.
+        return await make_response("Internal Server Error", 500)
+
+
+if __name__ == "__main__":
+    # Daemon listener never returns: joining it after app.run() would hang shutdown.
+    start_service_discovery("0.0.0.0", 30001)
+    app.run(host="0.0.0.0", port=10001)
--- a/examples/disaggregated_prefill/run_decode_server.sh
+++ b/examples/disaggregated_prefill/run_decode_server.sh
@@ -0,0 +1,37 @@
+export HCCL_IF_IP=2.0.0.0
+export GLOO_SOCKET_IFNAME="enp189s0f0"
+export TP_SOCKET_IFNAME="enp189s0f0"
+export HCCL_SOCKET_IFNAME="enp189s0f0"
+
+export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=100
+
+export VLLM_USE_V1=0
+
+export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+    --host 0.0.0.0 \
+    --port 20002 \
+    --tensor-parallel-size 8 \
+    --seed 1024 \
+    --served-model-name deepseek \
+    --max-model-len 2000 \
+    --max-num-batched-tokens 2000 \
+    --trust-remote-code \
+    --gpu-memory-utilization 0.9 \
+    --kv-transfer-config \
+    '{"kv_connector": "AscendSimpleConnector",
+    "kv_buffer_device": "npu",
+    "kv_role": "kv_consumer",
+    "kv_parallel_size": 8,
+    "kv_port":"21001",
+    "kv_connector_extra_config":
+    {"prompt_device_ips": ["1.2.3.1", "1.2.3.2", "1.2.3.3", "1.2.3.4", "1.2.3.5", "1.2.3.6", "1.2.3.7", "1.2.3.8"],
+    "decode_device_ips": ["1.2.3.9", "1.2.3.10", "1.2.3.11", "1.2.3.12", "1.2.3.13", "1.2.3.14", "1.2.3.15", "1.2.3.16"],
+    "llmdatadist_comm_port": 26000,
+    "proxy_ip":"3.0.0.0",
+    "proxy_port":"30001",
+    "http_port": 10002}
+    }'
--- a/examples/disaggregated_prefill/run_prefill_server.sh
+++ b/examples/disaggregated_prefill/run_prefill_server.sh
@@ -0,0 +1,37 @@
+export HCCL_IF_IP=1.0.0.0
+export GLOO_SOCKET_IFNAME="enp189s0f0"
+export TP_SOCKET_IFNAME="enp189s0f0"
+export HCCL_SOCKET_IFNAME="enp189s0f0"
+
+export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=100
+
+export VLLM_USE_V1=0
+
+export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+    --host 0.0.0.0 \
+    --port 10002 \
+    --tensor-parallel-size 8 \
+    --seed 1024 \
+    --served-model-name deepseek \
+    --max-model-len 2000 \
+    --max-num-batched-tokens 2000 \
+    --trust-remote-code \
+    --gpu-memory-utilization 0.9 \
+    --kv-transfer-config \
+    '{"kv_connector": "AscendSimpleConnector",
+    "kv_buffer_device": "npu",
+    "kv_role": "kv_producer",
+    "kv_parallel_size": 8,
+    "kv_port":"11001",
+    "kv_connector_extra_config":
+    {"prompt_device_ips": ["1.2.3.1", "1.2.3.2", "1.2.3.3", "1.2.3.4", "1.2.3.5", "1.2.3.6", "1.2.3.7", "1.2.3.8"],
+    "decode_device_ips": ["1.2.3.9", "1.2.3.10", "1.2.3.11", "1.2.3.12", "1.2.3.13", "1.2.3.14", "1.2.3.15", "1.2.3.16"],
+    "llmdatadist_comm_port": 26000,
+    "proxy_ip":"3.0.0.0",
+    "proxy_port":"30001",
+    "http_port": 10002}
+    }'