v0.10.1rc1

2025-09-09 09:40:35 +08:00
parent d6f6ef41fe
commit 9149384e03
432 changed files with 84698 additions and 1 deletions
--- a/tests/e2e/pd_disaggreate/run_edge_case_test.sh
+++ b/tests/e2e/pd_disaggreate/run_edge_case_test.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+# Driver for the prefill/decode disaggregation edge-case test.
+# Exports the environment the test run uses, generates a fresh rank table
+# for the local host, and installs a trap that kills background jobs when
+# the script exits or is interrupted.
+export LCCL_DETERMINISTIC=1
+export HCCL_DETERMINISTIC=true
+export CLOSE_MATMUL_K_SHIFT=1
+export VLLM_USE_V1=1
+
+set -xe
+
+# Models to run
+MODELS=(
+    "Qwen/Qwen3-0.6B-Instruct"
+)
+
+# Find the git repository root directory
+GIT_ROOT=$(git rev-parse --show-toplevel)
+
+# Kill every still-running background job of this shell.
+# `kill` with an empty argument list is a usage error, so only call it
+# when `jobs -pr` actually reported PIDs.
+kill_background_jobs() {
+  local pids
+  pids=$(jobs -pr)
+  if [[ -n "$pids" ]]; then
+    # Word splitting is intended here: one PID per word.
+    # shellcheck disable=SC2086
+    kill $pids 2>/dev/null || true
+  fi
+}
+
+# Clean up background jobs on Ctrl+C (SIGINT), SIGTERM and normal exit.
+trap kill_background_jobs SIGINT SIGTERM EXIT
+
+# Gen ranktable
+RANKTABLE_PATH="${GIT_ROOT}/examples/disaggregate_prefill_v1/ranktable.json"
+if [[ -f "$RANKTABLE_PATH" ]]; then
+    rm -- "$RANKTABLE_PATH"
+fi
+
+# First address reported by `hostname -I`.
+LOCAL_HOST=$(hostname -I | awk '{print $1}')
+if [[ -z "$LOCAL_HOST" ]]; then
+    echo "Could not determine a local IP address from 'hostname -I'" >&2
+    exit 1
+fi
+
+# Run the generator from its own directory; the subshell keeps the
+# working directory of this script unchanged.
+(
+    cd "${GIT_ROOT}/examples/disaggregate_prefill_v1"
+    bash gen_ranktable.sh --ips "$LOCAL_HOST" --network-card-name enp189s0f0 --prefill-device-cnt 1 --decode-device-cnt 1
+)
+export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="$RANKTABLE_PATH"
+
+# Waits for vLLM to start.
+wait_for_server() {
+  local port=$1
+  timeout 1200 bash -c "
+    until curl -s localhost:${port}/health > /dev/null; do
+      sleep 1
+    done" && return 0 || return 1
+}
+
+# Function to clean up previous instances
+cleanup_instances() {
+  echo "Cleaning up any running vLLM instances..."
+  pkill -f "vllm serve" || true
+  sleep 2
+}
+
+# Handle to get model-specific arguments for deepseek
+get_model_args() {
+  local model_name=$1
+  local extra_args=""
+
+  if [[ "$model_name" == *"deepseek"* ]]; then
+    extra_args="--trust-remote-code"
+  fi
+
+  echo "$extra_args"
+}
+
+
+# Function to run tests for a specific model
+run_tests_for_model() {
+  local model_name=$1
+  echo "================================"
+  echo "Testing model: $model_name"
+  echo "================================"
+
+  # Get model-specific arguments
+  local model_args=$(get_model_args "$model_name")
+  
+  # Start prefill instance
+  PREFILL_PORT=8001
+
+  BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=0 VLLM_LLMDD_RPC_PORT=5559 vllm serve $model_name \
+  --port $PREFILL_PORT \
+  --seed 1024 \
+  --enforce-eager \
+  --disable-log-requests \
+  --gpu-memory-utilization 0.8 \
+  --kv-transfer-config '{\"kv_connector\":\"LLMDataDistCMgrConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_device\":\"npu\",\"kv_parallel_size\":\"1\",\"kv_port\":\"20001\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.llmdatadist_c_mgr_connector\"}'"
+
+  if [ -n "$model_args" ]; then
+  FULL_CMD="$BASE_CMD $model_args"
+  else
+  FULL_CMD="$BASE_CMD"
+  fi
+
+  eval "$FULL_CMD &"
+
+  # Start decode instance
+  DECODE_PORT=8002
+
+  # Build the command with or without model-specific args
+  BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=1 VLLM_LLMDD_RPC_PORT=6000 vllm serve $model_name \
+  --port $DECODE_PORT \
+  --seed 1024 \
+  --enforce-eager \
+  --disable-log-requests \
+  --gpu-memory-utilization 0.8 \
+  --kv-transfer-config '{\"kv_connector\":\"LLMDataDistCMgrConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_device\":\"npu\",\"kv_parallel_size\":\"1\",\"kv_port\":\"20001\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.llmdatadist_c_mgr_connector\"}'"
+
+  if [ -n "$model_args" ]; then
+  FULL_CMD="$BASE_CMD $model_args"
+  else
+  FULL_CMD="$BASE_CMD"
+  fi
+
+  eval "$FULL_CMD &"
+
+  # Wait for all instances to start
+  echo "Waiting for prefill instance on port $PORT to start..."
+  wait_for_server $PREFILL_PORT
+  echo "Waiting for decode instance on port $PORT to start..."
+  wait_for_server $DECODE_PORT
+
+  # Build the command for the proxy server with all the hosts and ports
+  PROXY_PORT=8192
+  PROXY_CMD="python ${GIT_ROOT}/examples/disaggregate_prefill_v1/toy_proxy_server.py --port $PROXY_PORT"
+  PROXY_CMD+=" --prefiller-ports ${PREFILL_PORT}"
+  PROXY_CMD+=" --decoder-ports ${DECODE_PORT}"
+  # Start the proxy server
+  echo "Starting proxy server with command: $PROXY_CMD"
+  $PROXY_CMD &
+
+  # Wait for the proxy to start
+  sleep 5
+
+  # Run lm eval for this model
+  echo "Running tests for $model_name"
+  PREFILL_PORT=$PREFILL_PORT DECODE_PORT=$DECODE_PORT PROXY_PORT=$PROXY_PORT python -m pytest -s -v ${GIT_ROOT}/tests/e2e/pd_disaggreate/test_edge_cases.py
+
+  # Clean up before running next model
+  cleanup_instances
+  sleep 3
+}
+
+# Run tests for each model
+for model in "${MODELS[@]}"; do
+  run_tests_for_model "$model"
+done
+
+echo "All tests completed!"
--- a/tests/e2e/pd_disaggreate/setup_pd.sh
+++ b/tests/e2e/pd_disaggreate/setup_pd.sh
@@ -0,0 +1,136 @@
+#!/bin/bash
+
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+# Build the kv-transfer config for the producer role and run the prefill
+# `vllm serve` instance in the foreground on device 0.
+# Globals:   KV_CONFIG (written)
+# Arguments: $1 - model name
+#            $2 - tensor parallel size
+#            $3 - HTTP port of the prefill instance
+#            $4 - register port, stored as proxy_port in the config
+#            $5 - prefill device IPs, must be valid JSON (passed to --argjson)
+#            $6 - decode device IPs, must be valid JSON (passed to --argjson)
+# Returns:   1 if the config could not be built, otherwise the exit
+#            status of `vllm serve`.
+function run_prefill_instance() {
+  local model_name=$1
+  local tp_size=$2
+  local prefill_port=$3
+  local register_port=$4
+  local prefill_device_ips=$5
+  local decode_device_ips=$6
+
+  echo "================================"
+  echo "Testing model: $model_name"
+  echo "================================"
+  # Start prefill instance
+
+  # Abort instead of launching vllm with an empty config when jq rejects
+  # one of the inputs.
+  if ! KV_CONFIG=$(jq -n \
+    --arg kv_connector "AscendSimpleConnector" \
+    --arg kv_buffer_device "npu" \
+    --arg kv_role "kv_producer" \
+    --argjson kv_parallel_size 8 \
+    --arg kv_port 11001 \
+    --argjson prefill_device_ips "$prefill_device_ips" \
+    --argjson decode_device_ips "$decode_device_ips" \
+    --argjson llmdatadist_comm_port 26000 \
+    --arg proxy_ip "0.0.0.0" \
+    --argjson proxy_port "$register_port" \
+    --argjson http_port "$prefill_port" \
+    '{
+      "kv_connector": $kv_connector,
+      "kv_buffer_device": $kv_buffer_device,
+      "kv_role": $kv_role,
+      "kv_parallel_size": $kv_parallel_size,
+      "kv_port": $kv_port,
+      "kv_connector_extra_config": {
+        "prefill_device_ips": $prefill_device_ips,
+        "decode_device_ips": $decode_device_ips,
+        "llmdatadist_comm_port": $llmdatadist_comm_port,
+        "proxy_ip": $proxy_ip,
+        "proxy_port": $proxy_port,
+        "http_port": $http_port
+      }
+    }'); then
+    echo "Failed to build kv-transfer config for the prefill instance" >&2
+    return 1
+  fi
+
+  # start prefill instance
+  ASCEND_RT_VISIBLE_DEVICES=0 vllm serve "$model_name" \
+  --host 0.0.0.0 \
+  --port "$prefill_port" \
+  --tensor-parallel-size "$tp_size" \
+  --served-model-name Deepseek \
+  --max-model-len 2000 \
+  --trust-remote-code \
+  --enforce-eager \
+  --kv-transfer-config "$KV_CONFIG"
+}
+
+
+
+# Build the kv-transfer config for the consumer role and run the decode
+# `vllm serve` instance in the foreground on device 1.
+# Globals:   KV_CONFIG (written)
+# Arguments: $1 - model name
+#            $2 - tensor parallel size
+#            $3 - HTTP port of the decode instance
+#            $4 - register port, stored as proxy_port in the config
+#            $5 - prefill device IPs, must be valid JSON (passed to --argjson)
+#            $6 - decode device IPs, must be valid JSON (passed to --argjson)
+# Returns:   1 if the config could not be built, otherwise the exit
+#            status of `vllm serve`.
+function run_decode_instance() {
+  # Start decode instance
+  local model_name=$1
+  local tp_size=$2
+  local decode_port=$3
+  local register_port=$4
+  local prefill_device_ips=$5
+  local decode_device_ips=$6
+
+  # Abort instead of launching vllm with an empty config when jq rejects
+  # one of the inputs.
+  if ! KV_CONFIG=$(jq -n \
+    --arg kv_connector "AscendSimpleConnector" \
+    --arg kv_buffer_device "npu" \
+    --arg kv_role "kv_consumer" \
+    --argjson kv_parallel_size 8 \
+    --arg kv_port 21001 \
+    --argjson prefill_device_ips "$prefill_device_ips" \
+    --argjson decode_device_ips "$decode_device_ips" \
+    --argjson llmdatadist_comm_port 26000 \
+    --arg proxy_ip "0.0.0.0" \
+    --argjson proxy_port "$register_port" \
+    --argjson http_port "$decode_port" \
+    '{
+      "kv_connector": $kv_connector,
+      "kv_buffer_device": $kv_buffer_device,
+      "kv_role": $kv_role,
+      "kv_parallel_size": $kv_parallel_size,
+      "kv_port": $kv_port,
+      "kv_connector_extra_config": {
+        "prefill_device_ips": $prefill_device_ips,
+        "decode_device_ips": $decode_device_ips,
+        "llmdatadist_comm_port": $llmdatadist_comm_port,
+        "proxy_ip": $proxy_ip,
+        "proxy_port": $proxy_port,
+        "http_port": $http_port
+      }
+    }'); then
+    echo "Failed to build kv-transfer config for the decode instance" >&2
+    return 1
+  fi
+
+  # start decode instance
+  ASCEND_RT_VISIBLE_DEVICES=1 vllm serve "$model_name" \
+    --host 0.0.0.0 \
+    --port "$decode_port" \
+    --tensor-parallel-size "$tp_size" \
+    --seed 1024 \
+    --served-model-name Deepseek \
+    --max-model-len 2000 \
+    --max-num-batched-tokens 2000 \
+    --trust-remote-code \
+    --gpu-memory-utilization 0.9 \
+    --enforce-eager \
+    --kv-transfer-config "$KV_CONFIG"
+}
+
+# Start the proxy server in the background.
+# The script path is relative, so this must be called from a directory
+# where examples/disaggregated_prefill/ resolves.
+# Arguments: $1 - register port
+#            $2 - HTTP port the proxy serves on
+function run_proxy_server() {
+  # Keep the parameters local, like the sibling run_*_instance functions,
+  # so they do not leak into the caller's scope.
+  local register_port=$1
+  local proxy_port=$2
+
+  # Build the command for the proxy server with all the hosts and ports.
+  # An array keeps each argument intact regardless of its content.
+  local -a proxy_cmd=(
+    python examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py
+    --http-port "$proxy_port"
+    --register-port "$register_port"
+  )
+
+  # Start the proxy server
+  echo "Starting proxy server with command: ${proxy_cmd[*]}"
+  "${proxy_cmd[@]}" &
+}
--- a/tests/e2e/pd_disaggreate/test_edge_cases.py
+++ b/tests/e2e/pd_disaggreate/test_edge_cases.py
@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: Apache-2.0
+# This code is from: https://github.com/vllm-project/vllm/blob/main/tests/v1/kv_connector/nixl_integration/test_edge_cases.py
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+import os
+
+import openai
+
+PREFILL_PORT = os.getenv("PREFILL_PORT", None)
+DECODE_PORT = os.getenv("DECODE_PORT", None)
+PROXY_PORT = os.getenv("PROXY_PORT", None)
+
+if PREFILL_PORT is None or DECODE_PORT is None or PROXY_PORT is None:
+    raise ValueError(
+        "Please set the PREFILL_PORT, DECODE_PORT, and PROXY_PORT.")
+
+LONG_PROMPT = "Red Hat is the best company in the world to work for because it works on open source software, which means that all the contributions are delivered to the community. As a result, when working on projects like vLLM we are able to meet many amazing people from various organizations like AMD, Google, NVIDIA, "  # noqa: E501
+PROMPT = "Red Hat is the best company in the world to work for because it works on open source software, which means that all the contributions are delivered to the community. As a result,"  # noqa: E501
+SHORT_PROMPT = "Red Hat is "
+
+
+def test_edge_cases():
+    # Set the OpenAI API key and base URL
+    decode_client = openai.OpenAI(
+        api_key="MY_KEY",
+        base_url=f"http://localhost:{DECODE_PORT}/v1",
+    )
+    prefill_client = openai.OpenAI(
+        api_key="MY_KEY",
+        base_url=f"http://localhost:{PREFILL_PORT}/v1",
+    )
+    proxy_client = openai.OpenAI(
+        api_key="MY_KEY",
+        base_url=f"http://localhost:{PROXY_PORT}/v1",
+    )
+
+    # Get the list of models
+    models = decode_client.models.list()
+    MODEL = models.data[0].id
+
+    # (1) Check that we can handle a very short prompt,
+    # less than the length of the block size.
+    completion = proxy_client.completions.create(model=MODEL,
+                                                 prompt=SHORT_PROMPT,
+                                                 temperature=0)
+    proxy_response = completion.choices[0].text
+    completion = prefill_client.completions.create(model=MODEL,
+                                                   prompt=SHORT_PROMPT,
+                                                   temperature=0)
+    prefill_response = completion.choices[0].text
+    print(f"SMALL PROMPT: {proxy_response=}")
+    print(f"SMALL PROMPT: {prefill_response=}")
+    assert proxy_response == prefill_response
+
+    # (2) Check that we can handle a full prefix cache
+    # hit on the D worker but not on the P worker.
+    # (2a): prime the D worker.
+    completion = decode_client.completions.create(model=MODEL,
+                                                  prompt=PROMPT,
+                                                  temperature=0)
+    decode_response = completion.choices[0].text
+    # (2b): send via the P/D setup
+    completion = proxy_client.completions.create(model=MODEL,
+                                                 prompt=PROMPT,
+                                                 temperature=0)
+    proxy_response = completion.choices[0].text
+    print(f"FULL CACHE HIT: {proxy_response=}")
+    assert proxy_response == decode_response
+
+    # (3) Check that we can handle a partial prefix cache
+    # hit on the D worker.
+    completion = proxy_client.completions.create(model=MODEL,
+                                                 prompt=LONG_PROMPT,
+                                                 temperature=0)
+    proxy_response = completion.choices[0].text
+    completion = prefill_client.completions.create(model=MODEL,
+                                                   prompt=LONG_PROMPT,
+                                                   temperature=0)
+    prefill_response = completion.choices[0].text
+    print(f"PARTIAL CACHE HIT: {proxy_response=}")
+    assert proxy_response == prefill_response
--- a/tests/e2e/pd_disaggreate/test_pd_e2e.py
+++ b/tests/e2e/pd_disaggreate/test_pd_e2e.py
@@ -0,0 +1,109 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+import os
+import signal
+import subprocess
+import time
+
+import psutil
+import requests
+
+
+def kill_process_and_children(pid):
+    try:
+        parent = psutil.Process(pid)
+        children = parent.children(recursive=True)
+        for child in children:
+            print(f"Killing child process {child.pid}")
+            child.kill()
+        print(f"Killing parent process {pid}")
+        parent.kill()
+    except psutil.NoSuchProcess:
+        pass
+
+
+def kill_all_vllm_related():
+    current_pid = os.getpid()
+
+    for proc in psutil.process_iter(['pid', 'cmdline']):
+        try:
+            if proc.pid == current_pid:
+                continue
+            cmd = ' '.join(proc.info['cmdline'])
+            if "vllm" in cmd or "proxy" in cmd or "engine_worker" in cmd:
+                kill_process_and_children(proc.pid)
+        except Exception:
+            continue
+
+
+PROXY_PORT = 10102
+DECODE_PORT = 8002
+
+SCRIPT_PATH = os.path.abspath("./tests/e2e/run_disagg_pd.sh")
+
+
+def wait_for_port(port, timeout=30):
+    import socket
+    start = time.time()
+    while time.time() - start < timeout:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+            if sock.connect_ex(("127.0.0.1", port)) == 0:
+                return True
+        time.sleep(1)
+    raise TimeoutError(f"Port {port} not ready after {timeout}s")
+
+
+def start_and_test_pipeline():
+    print("Launching bash script to run vLLM PD setup...")
+    proc = subprocess.Popen(["bash", SCRIPT_PATH])
+    try:
+        print("Waiting for proxy port to be available...")
+        wait_for_port(PROXY_PORT, 180)
+        wait_for_port(DECODE_PORT, 600)
+
+        # request
+        payload = {
+            "model": "Deepseek",
+            "prompt": "The future of AI is",
+            "max_tokens": 64,
+            "temperature": 0,
+        }
+        response = requests.post(
+            f"http://localhost:{PROXY_PORT}/v1/completions",
+            headers={"Content-Type": "application/json"},
+            json=payload,
+            timeout=10)
+        assert response.status_code == 200, f"HTTP failed: {response.status_code}"
+        result = response.json()
+        print("Response:", result)
+        assert "text" in result["choices"][0]
+        assert len(result["choices"][0]["text"].strip()) > 0
+
+    finally:
+        # clean up subprocesses
+        print("Cleaning up subprocess...")
+        proc.send_signal(signal.SIGINT)
+        try:
+            proc.wait(timeout=10)
+        except subprocess.TimeoutExpired:
+            proc.kill()
+        kill_all_vllm_related()
+
+
+def test_disaggregated_pd_pipeline():
+    start_and_test_pipeline()