From 11bebb518c02f078fd59991f3f9bbacdeb800970 Mon Sep 17 00:00:00 2001 From: zhangyiming <34808445+menogrey@users.noreply.github.com> Date: Thu, 11 Dec 2025 09:23:38 +0800 Subject: [PATCH] [E2E] Remove unused PD-disaggreate scripts in E2E test. (#4837) ### What this PR does / why we need it? Remove unused PD-disaggreate scripts in E2E test. - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 Signed-off-by: menogrey <1299267905@qq.com> --- .../e2e/pd_disaggreate/run_edge_case_test.sh | 131 ----------------- tests/e2e/pd_disaggreate/setup_pd.sh | 134 ------------------ tests/e2e/pd_disaggreate/test_edge_cases.py | 81 ----------- tests/e2e/pd_disaggreate/test_pd_e2e.py | 109 -------------- tests/e2e/run_disagg_pd.sh | 58 -------- 5 files changed, 513 deletions(-) delete mode 100644 tests/e2e/pd_disaggreate/run_edge_case_test.sh delete mode 100644 tests/e2e/pd_disaggreate/setup_pd.sh delete mode 100644 tests/e2e/pd_disaggreate/test_edge_cases.py delete mode 100644 tests/e2e/pd_disaggreate/test_pd_e2e.py delete mode 100644 tests/e2e/run_disagg_pd.sh diff --git a/tests/e2e/pd_disaggreate/run_edge_case_test.sh b/tests/e2e/pd_disaggreate/run_edge_case_test.sh deleted file mode 100644 index 9bf49478..00000000 --- a/tests/e2e/pd_disaggreate/run_edge_case_test.sh +++ /dev/null @@ -1,131 +0,0 @@ -#!/bin/bash -export LCCL_DETERMINISTIC=1 -export HCCL_DETERMINISTIC=true -export CLOSE_MATMUL_K_SHIFT=1 - -set -xe - -# Models to run -MODELS=( - "Qwen/Qwen3-0.6B-Instruct" -) - -# Find the git repository root directory -GIT_ROOT=$(git rev-parse --show-toplevel) - -# Trap the SIGINT signal (triggered by Ctrl+C) -trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT - -# Waits for vLLM to start. -wait_for_server() { - local port=$1 - timeout 1200 bash -c " - until curl -s localhost:${port}/health > /dev/null; do - sleep 1 - done" && return 0 || return 1 -} - -# Function to clean up previous instances -cleanup_instances() { - echo "Cleaning up any running vLLM instances..." - pkill -f "vllm serve" || true - sleep 2 -} - -# Handle to get model-specific arguments for deepseek -get_model_args() { - local model_name=$1 - local extra_args="" - - if [[ "$model_name" == *"deepseek"* ]]; then - extra_args="--trust-remote-code" - fi - - echo "$extra_args" -} - - -# Function to run tests for a specific model -run_tests_for_model() { - local model_name=$1 - echo "================================" - echo "Testing model: $model_name" - echo "================================" - - # Get model-specific arguments - local model_args=$(get_model_args "$model_name") - - # Start prefill instance - PREFILL_PORT=8001 - - BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=0 vllm serve $model_name \ - --port $PREFILL_PORT \ - --seed 1024 \ - --enforce-eager \ - --disable-log-requests \ - --gpu-memory-utilization 0.8 \ - --distributed-executor-backend mp \ - --kv-transfer-config '{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_producer\",\"kv_port\":\"30000\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.mooncake_connector\",\"kv_connector_extra_config\":{\"prefill\":{\"dp_size\":1,\"tp_size\":1},\"decode\":{\"dp_size\":1,\"tp_size\":1}}}'" - - if [ -n "$model_args" ]; then - FULL_CMD="$BASE_CMD $model_args" - else - FULL_CMD="$BASE_CMD" - fi - - eval "$FULL_CMD &" - - # Start decode instance - DECODE_PORT=8002 - - # Build the command with or without model-specific args - BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=1 vllm serve $model_name \ - --port $DECODE_PORT \ - --seed 1024 \ - --enforce-eager \ - --disable-log-requests \ - --gpu-memory-utilization 0.8 \ - --distributed-executor-backend mp \ - --kv-transfer-config '{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_consumer\",\"kv_port\":\"30100\",\"engine_id\":\"1\",\"kv_connector_module_path\":\"vllm_ascend.distributed.mooncake_connector\",\"kv_connector_extra_config\":{\"prefill\":{\"dp_size\":1,\"tp_size\":1},\"decode\":{\"dp_size\":1,\"tp_size\":1}}}'" - - if [ -n "$model_args" ]; then - FULL_CMD="$BASE_CMD $model_args" - else - FULL_CMD="$BASE_CMD" - fi - - eval "$FULL_CMD &" - - # Wait for all instances to start - echo "Waiting for prefill instance on port $PORT to start..." - wait_for_server $PREFILL_PORT - echo "Waiting for decode instance on port $PORT to start..." - wait_for_server $DECODE_PORT - - # Build the command for the proxy server with all the hosts and ports - PROXY_PORT=8192 - PROXY_CMD="python ${GIT_ROOT}/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py --port $PROXY_PORT" - PROXY_CMD+=" --prefiller-ports ${PREFILL_PORT}" - PROXY_CMD+=" --decoder-ports ${DECODE_PORT}" - # Start the proxy server - echo "Starting proxy server with command: $PROXY_CMD" - $PROXY_CMD & - - # Wait for the proxy to start - sleep 5 - - # Run lm eval for this model - echo "Running tests for $model_name" - PREFILL_PORT=$PREFILL_PORT DECODE_PORT=$DECODE_PORT PROXY_PORT=$PROXY_PORT python -m pytest -s -v ${GIT_ROOT}/tests/e2e/pd_disaggreate/test_edge_cases.py - - # Clean up before running next model - cleanup_instances - sleep 3 -} - -# Run tests for each model -for model in "${MODELS[@]}"; do - run_tests_for_model "$model" -done - -echo "All tests completed!" \ No newline at end of file diff --git a/tests/e2e/pd_disaggreate/setup_pd.sh b/tests/e2e/pd_disaggreate/setup_pd.sh deleted file mode 100644 index 675bee43..00000000 --- a/tests/e2e/pd_disaggreate/setup_pd.sh +++ /dev/null @@ -1,134 +0,0 @@ -#!/bin/bash - -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -function run_prefill_instance() { - local model_name=$1 - local tp_size=$2 - local prefill_port=$3 - local register_port=$4 - local prefill_device_ips=$5 - local decode_device_ips=$6 - - echo "================================" - echo "Testing model: $model_name" - echo "================================" - # Start prefill instance - - KV_CONFIG=$(jq -n \ - --arg kv_connector "AscendSimpleConnector" \ - --arg kv_buffer_device "npu" \ - --arg kv_role "kv_producer" \ - --argjson kv_parallel_size 8 \ - --arg kv_port 11001 \ - --argjson prefill_device_ips "$prefill_device_ips" \ - --argjson decode_device_ips "$decode_device_ips" \ - --argjson llmdatadist_comm_port 26000 \ - --arg proxy_ip "0.0.0.0" \ - --argjson proxy_port "$register_port" \ - --argjson http_port "$prefill_port" \ - '{ - "kv_connector": $kv_connector, - "kv_buffer_device": $kv_buffer_device, - "kv_role": $kv_role, - "kv_parallel_size": $kv_parallel_size, - "kv_port": $kv_port, - "kv_connector_extra_config": { - "prefill_device_ips": $prefill_device_ips, - "decode_device_ips": $decode_device_ips, - "llmdatadist_comm_port": $llmdatadist_comm_port, - "proxy_ip": $proxy_ip, - "proxy_port": $proxy_port, - "http_port": $http_port - } - }') - - # start prefill instance - ASCEND_RT_VISIBLE_DEVICES=0 vllm serve $model_name \ - --host 0.0.0.0 \ - --port $prefill_port \ - --tensor-parallel-size $tp_size \ - --served-model-name Deepseek \ - --max-model-len 2000 \ - --trust-remote-code \ - --kv-transfer-config "$KV_CONFIG" -} - - - -function run_decode_instance() { - # Start decode instance - local model_name=$1 - local tp_size=$2 - local decode_port=$3 - local register_port=$4 - local prefill_device_ips=$5 - local decode_device_ips=$6 - - KV_CONFIG=$(jq -n \ - --arg kv_connector "AscendSimpleConnector" \ - --arg kv_buffer_device "npu" \ - --arg kv_role "kv_consumer" \ - --argjson kv_parallel_size 8 \ - --arg kv_port 21001 \ - --argjson prefill_device_ips "$prefill_device_ips" \ - --argjson decode_device_ips "$decode_device_ips" \ - --argjson llmdatadist_comm_port 26000 \ - --arg proxy_ip "0.0.0.0" \ - --argjson proxy_port "$register_port" \ - --argjson http_port "$decode_port" \ - '{ - "kv_connector": $kv_connector, - "kv_buffer_device": $kv_buffer_device, - "kv_role": $kv_role, - "kv_parallel_size": $kv_parallel_size, - "kv_port": $kv_port, - "kv_connector_extra_config": { - "prefill_device_ips": $prefill_device_ips, - "decode_device_ips": $decode_device_ips, - "llmdatadist_comm_port": $llmdatadist_comm_port, - "proxy_ip": $proxy_ip, - "proxy_port": $proxy_port, - "http_port": $http_port - } - }') - - # start decode instance - ASCEND_RT_VISIBLE_DEVICES=1 vllm serve $model_name \ - --host 0.0.0.0 \ - --port $decode_port \ - --tensor-parallel-size $tp_size \ - --seed 1024 \ - --served-model-name Deepseek \ - --max-model-len 2000 \ - --max-num-batched-tokens 2000 \ - --trust-remote-code \ - --gpu-memory-utilization 0.9 \ - --kv-transfer-config "$KV_CONFIG" -} - -function run_proxy_server() { - # Build the command for the proxy server with all the hosts and ports - register_port=$1 - proxy_port=$2 - PROXY_CMD="python examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py --http-port $proxy_port --register-port $register_port" - - # Start the proxy server - echo "Starting proxy server with command: $PROXY_CMD" - $PROXY_CMD & -} diff --git a/tests/e2e/pd_disaggreate/test_edge_cases.py b/tests/e2e/pd_disaggreate/test_edge_cases.py deleted file mode 100644 index fe53ddc6..00000000 --- a/tests/e2e/pd_disaggreate/test_edge_cases.py +++ /dev/null @@ -1,81 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# This code is from: https://github.com/vllm-project/vllm/blob/main/tests/v1/kv_connector/nixl_integration/test_edge_cases.py -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -import os - -import openai - -PREFILL_PORT = os.getenv("PREFILL_PORT", None) -DECODE_PORT = os.getenv("DECODE_PORT", None) -PROXY_PORT = os.getenv("PROXY_PORT", None) - -if PREFILL_PORT is None or DECODE_PORT is None or PROXY_PORT is None: - raise ValueError( - "Please set the PREFILL_PORT, DECODE_PORT, and PROXY_PORT.") - -LONG_PROMPT = "Red Hat is the best company in the world to work for because it works on open source software, which means that all the contributions are delivered to the community. As a result, when working on projects like vLLM we are able to meet many amazing people from various organizations like AMD, Google, NVIDIA, " # noqa: E501 -PROMPT = "Red Hat is the best company in the world to work for because it works on open source software, which means that all the contributions are delivered to the community. As a result," # noqa: E501 -SHORT_PROMPT = "Red Hat is " - - -def test_edge_cases(): - # Set the OpenAI API key and base URL - decode_client = openai.OpenAI( - api_key="MY_KEY", - base_url=f"http://localhost:{DECODE_PORT}/v1", - ) - prefill_client = openai.OpenAI( - api_key="MY_KEY", - base_url=f"http://localhost:{PREFILL_PORT}/v1", - ) - proxy_client = openai.OpenAI( - api_key="MY_KEY", - base_url=f"http://localhost:{PROXY_PORT}/v1", - ) - - # Get the list of models - models = decode_client.models.list() - MODEL = models.data[0].id - - # (1) Check that we can handle a very short prompt, - # less than the length of the block size. - completion = proxy_client.completions.create(model=MODEL, - prompt=SHORT_PROMPT, - temperature=0) - proxy_response = completion.choices[0].text - completion = prefill_client.completions.create(model=MODEL, - prompt=SHORT_PROMPT, - temperature=0) - prefill_response = completion.choices[0].text - print(f"SMALL PROMPT: {proxy_response=}") - print(f"SMALL PROMPT: {prefill_response=}") - assert proxy_response == prefill_response - - # (2) Check that we can handle a full prefix cache - # hit on the D worker but not on the P worker. - # (2a): prime the D worker. - completion = decode_client.completions.create(model=MODEL, - prompt=PROMPT, - temperature=0) - decode_response = completion.choices[0].text - # (2b): send via the P/D setup - completion = proxy_client.completions.create(model=MODEL, - prompt=PROMPT, - temperature=0) - proxy_response = completion.choices[0].text - print(f"FULL CACHE HIT: {proxy_response=}") - assert proxy_response == decode_response - - # (3) Check that we can handle a partial prefix cache - # hit on the D worker. - completion = proxy_client.completions.create(model=MODEL, - prompt=LONG_PROMPT, - temperature=0) - proxy_response = completion.choices[0].text - completion = prefill_client.completions.create(model=MODEL, - prompt=LONG_PROMPT, - temperature=0) - prefill_response = completion.choices[0].text - print(f"PARTIAL CACHE HIT: {proxy_response=}") - assert proxy_response == prefill_response \ No newline at end of file diff --git a/tests/e2e/pd_disaggreate/test_pd_e2e.py b/tests/e2e/pd_disaggreate/test_pd_e2e.py deleted file mode 100644 index 5fd92321..00000000 --- a/tests/e2e/pd_disaggreate/test_pd_e2e.py +++ /dev/null @@ -1,109 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -import os -import signal -import subprocess -import time - -import psutil -import requests - - -def kill_process_and_children(pid): - try: - parent = psutil.Process(pid) - children = parent.children(recursive=True) - for child in children: - print(f"Killing child process {child.pid}") - child.kill() - print(f"Killing parent process {pid}") - parent.kill() - except psutil.NoSuchProcess: - pass - - -def kill_all_vllm_related(): - current_pid = os.getpid() - - for proc in psutil.process_iter(['pid', 'cmdline']): - try: - if proc.pid == current_pid: - continue - cmd = ' '.join(proc.info['cmdline']) - if "vllm" in cmd or "proxy" in cmd or "engine_worker" in cmd: - kill_process_and_children(proc.pid) - except Exception: - continue - - -PROXY_PORT = 10102 -DECODE_PORT = 8002 - -SCRIPT_PATH = os.path.abspath("./tests/e2e/run_disagg_pd.sh") - - -def wait_for_port(port, timeout=30): - import socket - start = time.time() - while time.time() - start < timeout: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - if sock.connect_ex(("127.0.0.1", port)) == 0: - return True - time.sleep(1) - raise TimeoutError(f"Port {port} not ready after {timeout}s") - - -def start_and_test_pipeline(): - print("Launching bash script to run vLLM PD setup...") - proc = subprocess.Popen(["bash", SCRIPT_PATH]) - try: - print("Waiting for proxy port to be available...") - wait_for_port(PROXY_PORT, 180) - wait_for_port(DECODE_PORT, 600) - - # request - payload = { - "model": "Deepseek", - "prompt": "The future of AI is", - "max_tokens": 64, - "temperature": 0, - } - response = requests.post( - f"http://localhost:{PROXY_PORT}/v1/completions", - headers={"Content-Type": "application/json"}, - json=payload, - timeout=10) - assert response.status_code == 200, f"HTTP failed: {response.status_code}" - result = response.json() - print("Response:", result) - assert "text" in result["choices"][0] - assert len(result["choices"][0]["text"].strip()) > 0 - - finally: - # clean up subprocesses - print("Cleaning up subprocess...") - proc.send_signal(signal.SIGINT) - try: - proc.wait(timeout=10) - except subprocess.TimeoutExpired: - proc.kill() - kill_all_vllm_related() - - -def test_disaggregated_pd_pipeline(): - start_and_test_pipeline() diff --git a/tests/e2e/run_disagg_pd.sh b/tests/e2e/run_disagg_pd.sh deleted file mode 100644 index 99d0faa4..00000000 --- a/tests/e2e/run_disagg_pd.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash - -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -set -eo errexit - -. $(dirname "$0")/common.sh -. $(dirname "$0")/pd_disaggreate/setup_pd.sh - -export VLLM_USE_MODELSCOPE="True" - -MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite" -# TODO: add tp case -TP_SIZE=1 - -# TODO: support multi-card -prefill_ip=$(/usr/local/Ascend/driver/tools/hccn_tool -i 0 -ip -g | grep "ipaddr" | awk -F: '{print $2}' | xargs) -PREFILL_DEVICE_IPS="[\"$prefill_ip\"]" - -decode_ip=$(/usr/local/Ascend/driver/tools/hccn_tool -i 1 -ip -g | grep "ipaddr" | awk -F: '{print $2}' | xargs) -DECODE_DEVICE_IPS="[\"$decode_ip\"]" - -_info "====> Start pd disaggregated test" -REGISTER_PORT=10101 -PREOXY_PORT=10102 -run_proxy_server $REGISTER_PORT $PREOXY_PORT -_info "Started pd disaggregated proxy server" - -PREFILL_PROC_NAME="Prefill-instance" -PREFILL_PORT=8001 -_info "Starting prefill instance" -run_prefill_instance $MODEL_NAME $TP_SIZE $PREFILL_PORT $REGISTER_PORT $PREFILL_DEVICE_IPS $DECODE_DEVICE_IPS & -_info "Waiting for prefill instance ready" -wait_url_ready $PREFILL_PROC_NAME "http://localhost:${PREFILL_PORT}/v1/completions" - -DECODE_PROC_NAME="Decode-instance" -DECODE_PORT=8002 -_info "Starting decode instance" -run_decode_instance $MODEL_NAME $TP_SIZE $DECODE_PORT $REGISTER_PORT $PREFILL_DEVICE_IPS $DECODE_DEVICE_IPS & -_info "Waiting for decode instance ready" -wait_url_ready $DECODE_PROC_NAME "http://localhost:${DECODE_PORT}/v1/completions" - -_info "pd disaggregated system is ready for handling request"