From cb341c7bcd770954d29f413671c827d3249f74ac Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Mon, 9 Jun 2025 16:34:41 +0800 Subject: [PATCH] [CI] Fix PD job (#1129) Fix e2e test for Pd job Signed-off-by: wangxiyuan --- tests/e2e/common.sh | 4 ++-- tests/e2e/pd_disaggreate/setup_pd.sh | 4 ++-- tests/e2e/run_disagg_pd.sh | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/e2e/common.sh b/tests/e2e/common.sh index dd20d70..a8b8810 100644 --- a/tests/e2e/common.sh +++ b/tests/e2e/common.sh @@ -14,7 +14,7 @@ _err() { _red "Error: $*" && exit 1; } CURL_TIMEOUT=1 CURL_COOLDOWN=5 -CURL_MAX_TRIES=120 +CURL_MAX_TRIES=180 function wait_url_ready() { local serve_name="$1" @@ -31,7 +31,7 @@ function wait_url_ready() { break fi if [ "$i" -gt "$CURL_MAX_TRIES" ]; then - _info "===> \$CURL_MAX_TRIES exceeded waiting for ${serve_name} to be ready" + _info "===> ${CURL_MAX_TRIES}s exceeded waiting for ${serve_name} to be ready" return 1 fi sleep "$CURL_COOLDOWN" diff --git a/tests/e2e/pd_disaggreate/setup_pd.sh b/tests/e2e/pd_disaggreate/setup_pd.sh index e178e0f..675bee4 100644 --- a/tests/e2e/pd_disaggreate/setup_pd.sh +++ b/tests/e2e/pd_disaggreate/setup_pd.sh @@ -66,7 +66,7 @@ function run_prefill_instance() { --served-model-name Deepseek \ --max-model-len 2000 \ --trust-remote-code \ - --kv-transfer-config "$KV_CONFIG" & + --kv-transfer-config "$KV_CONFIG" } @@ -119,7 +119,7 @@ function run_decode_instance() { --max-num-batched-tokens 2000 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --kv-transfer-config "$KV_CONFIG" & + --kv-transfer-config "$KV_CONFIG" } function run_proxy_server() { diff --git a/tests/e2e/run_disagg_pd.sh b/tests/e2e/run_disagg_pd.sh index 16ed4a0..99d0faa 100644 --- a/tests/e2e/run_disagg_pd.sh +++ b/tests/e2e/run_disagg_pd.sh @@ -43,16 +43,16 @@ _info "Started pd disaggregated proxy server" PREFILL_PROC_NAME="Prefill-instance" PREFILL_PORT=8001 -run_prefill_instance $MODEL_NAME $TP_SIZE $PREFILL_PORT $REGISTER_PORT $PREFILL_DEVICE_IPS $DECODE_DEVICE_IPS _info "Starting prefill instance" - +run_prefill_instance $MODEL_NAME $TP_SIZE $PREFILL_PORT $REGISTER_PORT $PREFILL_DEVICE_IPS $DECODE_DEVICE_IPS & +_info "Waiting for prefill instance ready" wait_url_ready $PREFILL_PROC_NAME "http://localhost:${PREFILL_PORT}/v1/completions" DECODE_PROC_NAME="Decode-instance" DECODE_PORT=8002 -run_decode_instance $MODEL_NAME $TP_SIZE $DECODE_PORT $REGISTER_PORT $PREFILL_DEVICE_IPS $DECODE_DEVICE_IPS _info "Starting decode instance" - +run_decode_instance $MODEL_NAME $TP_SIZE $DECODE_PORT $REGISTER_PORT $PREFILL_DEVICE_IPS $DECODE_DEVICE_IPS & +_info "Waiting for decode instance ready" wait_url_ready $DECODE_PROC_NAME "http://localhost:${DECODE_PORT}/v1/completions" _info "pd disaggregated system is ready for handling request"