[E2E] Remove unused PD-disaggregate scripts in E2E test. (#4837)

### What this PR does / why we need it?
Remove unused PD-disaggregate scripts from the E2E tests.

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

Signed-off-by: menogrey <1299267905@qq.com>
Authored by zhangyiming on 2025-12-11 09:23:38 +08:00; committed by GitHub
parent 0eefbe75b6, commit 11bebb518c
5 changed files with 0 additions and 513 deletions


@@ -1,131 +0,0 @@
#!/bin/bash
export LCCL_DETERMINISTIC=1
export HCCL_DETERMINISTIC=true
export CLOSE_MATMUL_K_SHIFT=1
set -xe
# Models to run
MODELS=(
"Qwen/Qwen3-0.6B-Instruct"
)
# Find the git repository root directory
GIT_ROOT=$(git rev-parse --show-toplevel)
# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT
# Waits for vLLM to start.
wait_for_server() {
local port=$1
timeout 1200 bash -c "
until curl -s localhost:${port}/health > /dev/null; do
sleep 1
done" && return 0 || return 1
}
# Function to clean up previous instances
cleanup_instances() {
echo "Cleaning up any running vLLM instances..."
pkill -f "vllm serve" || true
sleep 2
}
# Helper returning model-specific arguments (DeepSeek models need --trust-remote-code)
get_model_args() {
local model_name=$1
local extra_args=""
if [[ "$model_name" == *"deepseek"* ]]; then
extra_args="--trust-remote-code"
fi
echo "$extra_args"
}
# Function to run tests for a specific model
run_tests_for_model() {
local model_name=$1
echo "================================"
echo "Testing model: $model_name"
echo "================================"
# Get model-specific arguments
local model_args=$(get_model_args "$model_name")
# Start prefill instance
PREFILL_PORT=8001
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=0 vllm serve $model_name \
--port $PREFILL_PORT \
--seed 1024 \
--enforce-eager \
--disable-log-requests \
--gpu-memory-utilization 0.8 \
--distributed-executor-backend mp \
--kv-transfer-config '{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_producer\",\"kv_port\":\"30000\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.mooncake_connector\",\"kv_connector_extra_config\":{\"prefill\":{\"dp_size\":1,\"tp_size\":1},\"decode\":{\"dp_size\":1,\"tp_size\":1}}}'"
if [ -n "$model_args" ]; then
FULL_CMD="$BASE_CMD $model_args"
else
FULL_CMD="$BASE_CMD"
fi
eval "$FULL_CMD &"
# Start decode instance
DECODE_PORT=8002
# Build the command with or without model-specific args
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=1 vllm serve $model_name \
--port $DECODE_PORT \
--seed 1024 \
--enforce-eager \
--disable-log-requests \
--gpu-memory-utilization 0.8 \
--distributed-executor-backend mp \
--kv-transfer-config '{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_consumer\",\"kv_port\":\"30100\",\"engine_id\":\"1\",\"kv_connector_module_path\":\"vllm_ascend.distributed.mooncake_connector\",\"kv_connector_extra_config\":{\"prefill\":{\"dp_size\":1,\"tp_size\":1},\"decode\":{\"dp_size\":1,\"tp_size\":1}}}'"
if [ -n "$model_args" ]; then
FULL_CMD="$BASE_CMD $model_args"
else
FULL_CMD="$BASE_CMD"
fi
eval "$FULL_CMD &"
# Wait for all instances to start
echo "Waiting for prefill instance on port $PORT to start..."
wait_for_server $PREFILL_PORT
echo "Waiting for decode instance on port $PORT to start..."
wait_for_server $DECODE_PORT
# Build the command for the proxy server with all the hosts and ports
PROXY_PORT=8192
PROXY_CMD="python ${GIT_ROOT}/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py --port $PROXY_PORT"
PROXY_CMD+=" --prefiller-ports ${PREFILL_PORT}"
PROXY_CMD+=" --decoder-ports ${DECODE_PORT}"
# Start the proxy server
echo "Starting proxy server with command: $PROXY_CMD"
$PROXY_CMD &
# Wait for the proxy to start
sleep 5
# Run lm eval for this model
echo "Running tests for $model_name"
PREFILL_PORT=$PREFILL_PORT DECODE_PORT=$DECODE_PORT PROXY_PORT=$PROXY_PORT python -m pytest -s -v ${GIT_ROOT}/tests/e2e/pd_disaggreate/test_edge_cases.py
# Clean up before running next model
cleanup_instances
sleep 3
}
# Run tests for each model
for model in "${MODELS[@]}"; do
run_tests_for_model "$model"
done
echo "All tests completed!"


@@ -1,134 +0,0 @@
#!/bin/bash
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
function run_prefill_instance() {
local model_name=$1
local tp_size=$2
local prefill_port=$3
local register_port=$4
local prefill_device_ips=$5
local decode_device_ips=$6
echo "================================"
echo "Testing model: $model_name"
echo "================================"
# Start prefill instance
KV_CONFIG=$(jq -n \
--arg kv_connector "AscendSimpleConnector" \
--arg kv_buffer_device "npu" \
--arg kv_role "kv_producer" \
--argjson kv_parallel_size 8 \
--arg kv_port 11001 \
--argjson prefill_device_ips "$prefill_device_ips" \
--argjson decode_device_ips "$decode_device_ips" \
--argjson llmdatadist_comm_port 26000 \
--arg proxy_ip "0.0.0.0" \
--argjson proxy_port "$register_port" \
--argjson http_port "$prefill_port" \
'{
"kv_connector": $kv_connector,
"kv_buffer_device": $kv_buffer_device,
"kv_role": $kv_role,
"kv_parallel_size": $kv_parallel_size,
"kv_port": $kv_port,
"kv_connector_extra_config": {
"prefill_device_ips": $prefill_device_ips,
"decode_device_ips": $decode_device_ips,
"llmdatadist_comm_port": $llmdatadist_comm_port,
"proxy_ip": $proxy_ip,
"proxy_port": $proxy_port,
"http_port": $http_port
}
}')
# start prefill instance
ASCEND_RT_VISIBLE_DEVICES=0 vllm serve $model_name \
--host 0.0.0.0 \
--port $prefill_port \
--tensor-parallel-size $tp_size \
--served-model-name Deepseek \
--max-model-len 2000 \
--trust-remote-code \
--kv-transfer-config "$KV_CONFIG"
}
function run_decode_instance() {
# Start decode instance
local model_name=$1
local tp_size=$2
local decode_port=$3
local register_port=$4
local prefill_device_ips=$5
local decode_device_ips=$6
KV_CONFIG=$(jq -n \
--arg kv_connector "AscendSimpleConnector" \
--arg kv_buffer_device "npu" \
--arg kv_role "kv_consumer" \
--argjson kv_parallel_size 8 \
--arg kv_port 21001 \
--argjson prefill_device_ips "$prefill_device_ips" \
--argjson decode_device_ips "$decode_device_ips" \
--argjson llmdatadist_comm_port 26000 \
--arg proxy_ip "0.0.0.0" \
--argjson proxy_port "$register_port" \
--argjson http_port "$decode_port" \
'{
"kv_connector": $kv_connector,
"kv_buffer_device": $kv_buffer_device,
"kv_role": $kv_role,
"kv_parallel_size": $kv_parallel_size,
"kv_port": $kv_port,
"kv_connector_extra_config": {
"prefill_device_ips": $prefill_device_ips,
"decode_device_ips": $decode_device_ips,
"llmdatadist_comm_port": $llmdatadist_comm_port,
"proxy_ip": $proxy_ip,
"proxy_port": $proxy_port,
"http_port": $http_port
}
}')
# start decode instance
ASCEND_RT_VISIBLE_DEVICES=1 vllm serve $model_name \
--host 0.0.0.0 \
--port $decode_port \
--tensor-parallel-size $tp_size \
--seed 1024 \
--served-model-name Deepseek \
--max-model-len 2000 \
--max-num-batched-tokens 2000 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--kv-transfer-config "$KV_CONFIG"
}
function run_proxy_server() {
# Build the command for the proxy server with all the hosts and ports
register_port=$1
proxy_port=$2
PROXY_CMD="python examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py --http-port $proxy_port --register-port $register_port"
# Start the proxy server
echo "Starting proxy server with command: $PROXY_CMD"
$PROXY_CMD &
}
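With the values the run script below passes in (prefill_port 8001, register_port 10101, single-element device-IP lists from hccn_tool; the placeholder IPs here are illustrative), the jq template in run_prefill_instance produces JSON of this shape. Note that kv_port stays a string because it is bound with --arg, while the --argjson bindings stay numeric:

{
  "kv_connector": "AscendSimpleConnector",
  "kv_buffer_device": "npu",
  "kv_role": "kv_producer",
  "kv_parallel_size": 8,
  "kv_port": "11001",
  "kv_connector_extra_config": {
    "prefill_device_ips": ["<prefill-device-ip>"],
    "decode_device_ips": ["<decode-device-ip>"],
    "llmdatadist_comm_port": 26000,
    "proxy_ip": "0.0.0.0",
    "proxy_port": 10101,
    "http_port": 8001
  }
}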


@@ -1,81 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# This code is from: https://github.com/vllm-project/vllm/blob/main/tests/v1/kv_connector/nixl_integration/test_edge_cases.py
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
import os
import openai
PREFILL_PORT = os.getenv("PREFILL_PORT", None)
DECODE_PORT = os.getenv("DECODE_PORT", None)
PROXY_PORT = os.getenv("PROXY_PORT", None)
if PREFILL_PORT is None or DECODE_PORT is None or PROXY_PORT is None:
raise ValueError(
"Please set the PREFILL_PORT, DECODE_PORT, and PROXY_PORT.")
LONG_PROMPT = "Red Hat is the best company in the world to work for because it works on open source software, which means that all the contributions are delivered to the community. As a result, when working on projects like vLLM we are able to meet many amazing people from various organizations like AMD, Google, NVIDIA, " # noqa: E501
PROMPT = "Red Hat is the best company in the world to work for because it works on open source software, which means that all the contributions are delivered to the community. As a result," # noqa: E501
SHORT_PROMPT = "Red Hat is "
def test_edge_cases():
# Set the OpenAI API key and base URL
decode_client = openai.OpenAI(
api_key="MY_KEY",
base_url=f"http://localhost:{DECODE_PORT}/v1",
)
prefill_client = openai.OpenAI(
api_key="MY_KEY",
base_url=f"http://localhost:{PREFILL_PORT}/v1",
)
proxy_client = openai.OpenAI(
api_key="MY_KEY",
base_url=f"http://localhost:{PROXY_PORT}/v1",
)
# Get the list of models
models = decode_client.models.list()
MODEL = models.data[0].id
# (1) Check that we can handle a very short prompt,
# less than the length of the block size.
completion = proxy_client.completions.create(model=MODEL,
prompt=SHORT_PROMPT,
temperature=0)
proxy_response = completion.choices[0].text
completion = prefill_client.completions.create(model=MODEL,
prompt=SHORT_PROMPT,
temperature=0)
prefill_response = completion.choices[0].text
print(f"SMALL PROMPT: {proxy_response=}")
print(f"SMALL PROMPT: {prefill_response=}")
assert proxy_response == prefill_response
# (2) Check that we can handle a full prefix cache
# hit on the D worker but not on the P worker.
# (2a): prime the D worker.
completion = decode_client.completions.create(model=MODEL,
prompt=PROMPT,
temperature=0)
decode_response = completion.choices[0].text
# (2b): send via the P/D setup
completion = proxy_client.completions.create(model=MODEL,
prompt=PROMPT,
temperature=0)
proxy_response = completion.choices[0].text
print(f"FULL CACHE HIT: {proxy_response=}")
assert proxy_response == decode_response
# (3) Check that we can handle a partial prefix cache
# hit on the D worker.
completion = proxy_client.completions.create(model=MODEL,
prompt=LONG_PROMPT,
temperature=0)
proxy_response = completion.choices[0].text
completion = prefill_client.completions.create(model=MODEL,
prompt=LONG_PROMPT,
temperature=0)
prefill_response = completion.choices[0].text
print(f"PARTIAL CACHE HIT: {proxy_response=}")
assert proxy_response == prefill_response
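Since the test reads its three ports from the environment, it can also be run standalone against an already-launched P/D pair, mirroring the invocation in the first script:

PREFILL_PORT=8001 DECODE_PORT=8002 PROXY_PORT=8192 \
    python -m pytest -s -v tests/e2e/pd_disaggreate/test_edge_cases.py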


@@ -1,109 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import os
import signal
import subprocess
import time
import psutil
import requests
def kill_process_and_children(pid):
try:
parent = psutil.Process(pid)
children = parent.children(recursive=True)
for child in children:
print(f"Killing child process {child.pid}")
child.kill()
print(f"Killing parent process {pid}")
parent.kill()
except psutil.NoSuchProcess:
pass
def kill_all_vllm_related():
current_pid = os.getpid()
for proc in psutil.process_iter(['pid', 'cmdline']):
try:
if proc.pid == current_pid:
continue
cmd = ' '.join(proc.info['cmdline'] or [])  # cmdline can be None (e.g. kernel threads)
if "vllm" in cmd or "proxy" in cmd or "engine_worker" in cmd:
kill_process_and_children(proc.pid)
except Exception:
continue
PROXY_PORT = 10102
DECODE_PORT = 8002
SCRIPT_PATH = os.path.abspath("./tests/e2e/run_disagg_pd.sh")
def wait_for_port(port, timeout=30):
import socket
start = time.time()
while time.time() - start < timeout:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
if sock.connect_ex(("127.0.0.1", port)) == 0:
return True
time.sleep(1)
raise TimeoutError(f"Port {port} not ready after {timeout}s")
def start_and_test_pipeline():
print("Launching bash script to run vLLM PD setup...")
proc = subprocess.Popen(["bash", SCRIPT_PATH])
try:
print("Waiting for proxy port to be available...")
wait_for_port(PROXY_PORT, 180)
wait_for_port(DECODE_PORT, 600)
# request
payload = {
"model": "Deepseek",
"prompt": "The future of AI is",
"max_tokens": 64,
"temperature": 0,
}
response = requests.post(
f"http://localhost:{PROXY_PORT}/v1/completions",
headers={"Content-Type": "application/json"},
json=payload,
timeout=10)
assert response.status_code == 200, f"HTTP failed: {response.status_code}"
result = response.json()
print("Response:", result)
assert "text" in result["choices"][0]
assert len(result["choices"][0]["text"].strip()) > 0
finally:
# clean up subprocesses
print("Cleaning up subprocess...")
proc.send_signal(signal.SIGINT)
try:
proc.wait(timeout=10)
except subprocess.TimeoutExpired:
proc.kill()
kill_all_vllm_related()
def test_disaggregated_pd_pipeline():
start_and_test_pipeline()
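The same smoke check can be reproduced by hand once the proxy is listening; this curl call is equivalent to the requests.post payload above:

curl -s http://localhost:10102/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "Deepseek", "prompt": "The future of AI is", "max_tokens": 64, "temperature": 0}'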


@@ -1,58 +0,0 @@
#!/bin/bash
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
set -o errexit
. $(dirname "$0")/common.sh
. $(dirname "$0")/pd_disaggreate/setup_pd.sh
export VLLM_USE_MODELSCOPE="True"
MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite"
# TODO: add tp case
TP_SIZE=1
# TODO: support multi-card
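# hccn_tool prints a line of the form "ipaddr:<ip>" per NPU (format assumed from
# the pipeline below); awk splits on ':' and xargs trims surrounding whitespace.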
prefill_ip=$(/usr/local/Ascend/driver/tools/hccn_tool -i 0 -ip -g | grep "ipaddr" | awk -F: '{print $2}' | xargs)
PREFILL_DEVICE_IPS="[\"$prefill_ip\"]"
decode_ip=$(/usr/local/Ascend/driver/tools/hccn_tool -i 1 -ip -g | grep "ipaddr" | awk -F: '{print $2}' | xargs)
DECODE_DEVICE_IPS="[\"$decode_ip\"]"
_info "====> Start pd disaggregated test"
REGISTER_PORT=10101
PROXY_PORT=10102
run_proxy_server $REGISTER_PORT $PROXY_PORT
_info "Started pd disaggregated proxy server"
PREFILL_PROC_NAME="Prefill-instance"
PREFILL_PORT=8001
_info "Starting prefill instance"
run_prefill_instance $MODEL_NAME $TP_SIZE $PREFILL_PORT $REGISTER_PORT $PREFILL_DEVICE_IPS $DECODE_DEVICE_IPS &
_info "Waiting for prefill instance ready"
wait_url_ready $PREFILL_PROC_NAME "http://localhost:${PREFILL_PORT}/v1/completions"
DECODE_PROC_NAME="Decode-instance"
DECODE_PORT=8002
_info "Starting decode instance"
run_decode_instance $MODEL_NAME $TP_SIZE $DECODE_PORT $REGISTER_PORT $PREFILL_DEVICE_IPS $DECODE_DEVICE_IPS &
_info "Waiting for decode instance ready"
wait_url_ready $DECODE_PROC_NAME "http://localhost:${DECODE_PORT}/v1/completions"
_info "pd disaggregated system is ready for handling request"