Disaggregate prefill for kv cache register style (#950)
### What this PR does / why we need it?
This PR adopts `LLMDataDist` for kv cache registration and `pull_blocks`
style disaggregate prefill implementation. The interface implementation
mainly follows the design of NIXL PR
https://github.com/vllm-project/vllm/pull/17751/files#diff-7eaad0b7dee0626bf29d10081b0f0c5e3ea15a4af97e7b182a4e0d35f8346953
.
This PR can be tested with the following steps:
- Generate the rank table for all machines.
- Execute `toy_proxy.py` to launch the disaggregated prefill proxy server,
specifying the prefill IP/port and the decode IP/port.
- Run the prefill server and decode server.
- send the request to the disaggregate prefill proxy
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.9.2
- vLLM main:
8d0a01a5f2
---------
Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
Signed-off-by: machenglong <machenglong_yewu@cmss.chinamobile.com>
Signed-off-by: liziyu179 <3475441767@qq.com>
Signed-off-by: underfitc <hucong24@huawei.com>
Signed-off-by: zouyida2052 <zouyida@huawei.com>
Signed-off-by: liziyu <liziyu16@huawei.com>
Signed-off-by: underfituu <hzhucong@163.com>
Co-authored-by: machenglong <machenglong_yewu@cmss.chinamobile.com>
Co-authored-by: liziyu179 <3475441767@qq.com>
Co-authored-by: underfitc <hucong24@huawei.com>
Co-authored-by: zouyida2052 <zouyida@huawei.com>
Co-authored-by: liziyu <liziyu16@huawei.com>
Co-authored-by: underfituu <hzhucong@163.com>
This commit is contained in:
141
tests/e2e/pd_disaggreate/run_edge_case_test.sh
Normal file
141
tests/e2e/pd_disaggreate/run_edge_case_test.sh
Normal file
@@ -0,0 +1,141 @@
|
||||
#!/bin/bash
# Edge-case e2e test driver for disaggregated prefill (LLMDataDistCMgrConnector).
# Generates a single-host rank table, then (below) launches a prefill/decode
# vLLM pair plus a toy proxy and runs the edge-case pytest suite.
export LCCL_DETERMINISTIC=1
export HCCL_DETERMINISTIC=true
export CLOSE_MATMUL_K_SHIFT=1
export VLLM_USE_V1=1

set -xe

# Models to run
MODELS=(
    "Qwen/Qwen3-0.6B-Instruct"
)

# Find the git repository root directory
GIT_ROOT=$(git rev-parse --show-toplevel)

# Trap the SIGINT signal (triggered by Ctrl+C); kill background jobs on exit
trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT

# Gen ranktable (remove any stale one first so gen_ranktable.sh starts clean)
RANKTABLE_PATH=${GIT_ROOT}/examples/disaggregate_prefill_v1/ranktable.json
if [ -f "$RANKTABLE_PATH" ]; then
    rm -f -- "$RANKTABLE_PATH"
fi
cd "${GIT_ROOT}/examples/disaggregate_prefill_v1"
# First address reported by `hostname -I` is used as the local node IP.
LOCAL_HOST=$(hostname -I | awk '{print $1}')
bash gen_ranktable.sh --ips "$LOCAL_HOST" --network-card-name enp189s0f0 --prefill-device-cnt 1 --decode-device-cnt 1
cd -
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="$RANKTABLE_PATH"
|
||||
|
||||
# Waits for vLLM to start.
|
||||
# Block until the vLLM server on the given port answers its /health endpoint,
# giving up after 20 minutes. Returns 0 on success, 1 on timeout.
wait_for_server() {
  local port="$1"
  if timeout 1200 bash -c "until curl -s localhost:${port}/health > /dev/null; do sleep 1; done"; then
    return 0
  else
    return 1
  fi
}
|
||||
|
||||
# Function to clean up previous instances
|
||||
# Kill any leftover `vllm serve` processes from a previous run, then give the
# OS a moment to release their ports.
cleanup_instances() {
  echo "Cleaning up any running vLLM instances..."
  # pkill returns non-zero when nothing matched; that is not an error here.
  pkill -f "vllm serve" || true
  sleep 2
}
|
||||
|
||||
# Handle to get model-specific arguments for deepseek
|
||||
# Print model-specific extra CLI flags for `vllm serve` on stdout.
# Currently only deepseek models need anything (--trust-remote-code).
get_model_args() {
  local name="$1"
  case "$name" in
    *deepseek*)
      printf '%s\n' "--trust-remote-code"
      ;;
    *)
      printf '%s\n' ""
      ;;
  esac
}
|
||||
|
||||
|
||||
# Function to run tests for a specific model
|
||||
# Launch a prefill (kv_producer) and decode (kv_consumer) vLLM instance for the
# given model, front them with the toy proxy, run the edge-case pytest suite,
# then tear everything down.
run_tests_for_model() {
  local model_name=$1
  echo "================================"
  echo "Testing model: $model_name"
  echo "================================"

  # Get model-specific arguments. Declare and assign separately so the
  # helper's exit status is not masked by `local`.
  local model_args
  model_args=$(get_model_args "$model_name")

  # Start prefill instance on NPU 0
  PREFILL_PORT=8001

  BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=0 VLLM_LLMDD_RPC_PORT=5559 vllm serve $model_name \
    --port $PREFILL_PORT \
    --seed 1024 \
    --enforce-eager \
    --disable-log-requests \
    --gpu-memory-utilization 0.8 \
    --kv-transfer-config '{\"kv_connector\":\"LLMDataDistCMgrConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_device\":\"npu\",\"kv_parallel_size\":\"1\",\"kv_port\":\"20001\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.llmdatadist_c_mgr_connector\"}'"

  if [ -n "$model_args" ]; then
    FULL_CMD="$BASE_CMD $model_args"
  else
    FULL_CMD="$BASE_CMD"
  fi

  eval "$FULL_CMD &"

  # Start decode instance on NPU 1
  DECODE_PORT=8002

  # Build the command with or without model-specific args
  BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=1 VLLM_LLMDD_RPC_PORT=6000 vllm serve $model_name \
    --port $DECODE_PORT \
    --seed 1024 \
    --enforce-eager \
    --disable-log-requests \
    --gpu-memory-utilization 0.8 \
    --kv-transfer-config '{\"kv_connector\":\"LLMDataDistCMgrConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_device\":\"npu\",\"kv_parallel_size\":\"1\",\"kv_port\":\"20001\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.llmdatadist_c_mgr_connector\"}'"

  if [ -n "$model_args" ]; then
    FULL_CMD="$BASE_CMD $model_args"
  else
    FULL_CMD="$BASE_CMD"
  fi

  eval "$FULL_CMD &"

  # Wait for both instances to start.
  # Fix: the original messages referenced an undefined $PORT variable.
  echo "Waiting for prefill instance on port $PREFILL_PORT to start..."
  wait_for_server "$PREFILL_PORT"
  echo "Waiting for decode instance on port $DECODE_PORT to start..."
  wait_for_server "$DECODE_PORT"

  # Build the command for the proxy server with all the hosts and ports
  PROXY_PORT=8192
  PROXY_CMD="python ${GIT_ROOT}/examples/disaggregate_prefill_v1/toy_proxy_server.py --port $PROXY_PORT"
  PROXY_CMD+=" --prefiller-ports ${PREFILL_PORT}"
  PROXY_CMD+=" --decoder-ports ${DECODE_PORT}"
  # Start the proxy server
  echo "Starting proxy server with command: $PROXY_CMD"
  $PROXY_CMD &

  # Wait for the proxy to start
  sleep 5

  # Run the edge-case suite against the full P/D deployment
  echo "Running tests for $model_name"
  PREFILL_PORT=$PREFILL_PORT DECODE_PORT=$DECODE_PORT PROXY_PORT=$PROXY_PORT python -m pytest -s -v ${GIT_ROOT}/tests/e2e/pd_disaggreate/test_edge_cases.py

  # Clean up before running next model
  cleanup_instances
  sleep 3
}
|
||||
|
||||
# Run tests for each model
|
||||
# Run the full edge-case suite once per configured model.
for model_id in "${MODELS[@]}"; do
  run_tests_for_model "$model_id"
done

echo "All tests completed!"
|
||||
81
tests/e2e/pd_disaggreate/test_edge_cases.py
Normal file
81
tests/e2e/pd_disaggreate/test_edge_cases.py
Normal file
@@ -0,0 +1,81 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
# This code is from: https://github.com/vllm-project/vllm/blob/main/tests/v1/kv_connector/nixl_integration/test_edge_cases.py
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
import os

import openai

# Ports of the prefill worker, decode worker, and P/D proxy, supplied by the
# launcher script via environment variables.
PREFILL_PORT = os.environ.get("PREFILL_PORT")
DECODE_PORT = os.environ.get("DECODE_PORT")
PROXY_PORT = os.environ.get("PROXY_PORT")

# Fail fast when the test is run outside the launcher script.
if None in (PREFILL_PORT, DECODE_PORT, PROXY_PORT):
    raise ValueError(
        "Please set the PREFILL_PORT, DECODE_PORT, and PROXY_PORT.")

LONG_PROMPT = "Red Hat is the best company in the world to work for because it works on open source software, which means that all the contributions are delivered to the community. As a result, when working on projects like vLLM we are able to meet many amazing people from various organizations like AMD, Google, NVIDIA, "  # noqa: E501
PROMPT = "Red Hat is the best company in the world to work for because it works on open source software, which means that all the contributions are delivered to the community. As a result,"  # noqa: E501
SHORT_PROMPT = "Red Hat is "
|
||||
|
||||
|
||||
def test_edge_cases():
    """Exercise prefix-cache edge cases of the disaggregated P/D deployment.

    Compares greedy completions from the proxy against completions obtained
    directly from the prefill/decode workers; with temperature=0 they must
    be identical.
    """
    # One client per endpoint: decode worker, prefill worker, and the proxy.
    decode_client = openai.OpenAI(
        api_key="MY_KEY",
        base_url=f"http://localhost:{DECODE_PORT}/v1",
    )
    prefill_client = openai.OpenAI(
        api_key="MY_KEY",
        base_url=f"http://localhost:{PREFILL_PORT}/v1",
    )
    proxy_client = openai.OpenAI(
        api_key="MY_KEY",
        base_url=f"http://localhost:{PROXY_PORT}/v1",
    )

    # All endpoints serve the same model; take its id from the decode worker.
    models = decode_client.models.list()
    MODEL = models.data[0].id

    def completion_text(client, prompt):
        # Greedy decoding (temperature=0) so responses are comparable.
        result = client.completions.create(model=MODEL,
                                           prompt=prompt,
                                           temperature=0)
        return result.choices[0].text

    # (1) Check that we can handle a very short prompt,
    # less than the length of the block size.
    proxy_response = completion_text(proxy_client, SHORT_PROMPT)
    prefill_response = completion_text(prefill_client, SHORT_PROMPT)
    print(f"SMALL PROMPT: {proxy_response=}")
    print(f"SMALL PROMPT: {prefill_response=}")
    assert proxy_response == prefill_response

    # (2) Check that we can handle a full prefix cache
    # hit on the D worker but not on the P worker.
    # (2a): prime the D worker.
    decode_response = completion_text(decode_client, PROMPT)
    # (2b): send via the P/D setup
    proxy_response = completion_text(proxy_client, PROMPT)
    print(f"FULL CACHE HIT: {proxy_response=}")
    assert proxy_response == decode_response

    # (3) Check that we can handle a partial prefix cache
    # hit on the D worker.
    proxy_response = completion_text(proxy_client, LONG_PROMPT)
    prefill_response = completion_text(prefill_client, LONG_PROMPT)
    print(f"PARTIAL CACHE HIT: {proxy_response=}")
    assert proxy_response == prefill_response
|
||||
Reference in New Issue
Block a user