diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index 3efa5c2f1..acc8a6bb9 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -36,6 +36,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: @@ -62,6 +65,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: @@ -86,6 +92,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: @@ -110,6 +119,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: @@ -150,6 +162,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: @@ -184,6 +199,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: @@ -229,6 +247,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: @@ -253,6 +274,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: 
bash scripts/ci/amd_ci_start_container.sh env: @@ -277,6 +301,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: @@ -301,6 +328,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: diff --git a/scripts/check_vram_clear.sh b/scripts/check_vram_clear.sh new file mode 100755 index 000000000..51e5a915f --- /dev/null +++ b/scripts/check_vram_clear.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +check_vram_clear() { + local vram_threshold_percent=5 # Allow up to 5% VRAM usage + local memory_threshold_mb=500 # Allow up to 500MB memory usage + + if command -v rocm-smi >/dev/null 2>&1; then + echo "Checking ROCm GPU VRAM usage..." + # Check if any GPU has more than threshold VRAM allocated + local high_usage=$(rocm-smi --showmemuse | grep -E "GPU Memory Allocated \(VRAM%\): ([6-9]|[1-9][0-9]|100)") + if [ -n "$high_usage" ]; then + echo "ERROR: VRAM usage exceeds threshold (${vram_threshold_percent}%) on some GPUs:" + echo "$high_usage" + rocm-smi --showmemuse + return 1 + else + echo "✓ VRAM usage is within acceptable limits on all GPUs" + return 0 + fi + fi +} + +# If this script is run directly (not sourced), run the check +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + set -e + check_vram_clear +fi diff --git a/scripts/ensure_vram_clear.sh b/scripts/ensure_vram_clear.sh new file mode 100755 index 000000000..0dd720960 --- /dev/null +++ b/scripts/ensure_vram_clear.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +# Source the VRAM checking function +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/check_vram_clear.sh" + +ensure_vram_clear() { + local max_retries=3 + local retry_count=0 + + # Stop and remove any existing ci_sglang container + 
echo "Stopping any existing ci_sglang container..." + docker stop ci_sglang || true + docker rm ci_sglang || true + + # Log host information for debugging + echo "=== Host Information ===" + echo "Hostname: $(hostname)" + echo "Host IP: $(hostname -I 2>/dev/null || echo 'N/A')" + echo "Date: $(date)" + echo "Mode: rocm" + echo "========================" + echo "Running in ROCm mode" + + # Show initial GPU status + echo "=== Initial GPU Memory Status ===" + rocm-smi --showmemuse + echo "==================================" + + while [ $retry_count -lt $max_retries ]; do + echo "=== Cleanup Attempt $((retry_count + 1))/$max_retries ===" + + # Clean SGLang processes + echo "Killing SGLang processes..." + pgrep -f 'sglang::|sglang\.launch_server|sglang\.bench|sglang\.data_parallel|sglang\.srt' | xargs -r kill -9 || true + + if [ $retry_count -gt 0 ]; then + echo "Performing aggressive cleanup..." + # Kill all processes using KFD + rocm-smi --showpids 2>/dev/null | grep 'PID:' | awk '{print $2}' | xargs -r kill -9 2>/dev/null || true + # Wait a bit for cleanup to take effect + echo "Waiting 30 seconds for VRAM to clear..." + sleep 30 + fi + + # Check VRAM + echo "Checking VRAM status..." 
+ if check_vram_clear; then + echo "✓ VRAM cleanup successful after $((retry_count + 1)) attempts" + return 0 + else + echo "✗ VRAM still not clear after attempt $((retry_count + 1))" + retry_count=$((retry_count + 1)) + fi + done + + # Failed after all retries + echo "=== FAILED: VRAM cleanup unsuccessful after $max_retries attempts ===" + echo "Final GPU status:" + timeout 30 rocm-smi --showmemuse || echo "rocm-smi timed out" + echo "Processes using GPU:" + rocm-smi --showpids 2>/dev/null | grep 'PID:' || echo "No processes found using /dev/kfd" + + # Print detailed information about suspicious processes + echo "=== Detailed Process Information ===" + if command -v rocm-smi >/dev/null 2>&1; then + # For AMD GPUs, get processes from rocm-smi --showpids + kfd_pids=$(rocm-smi --showpids 2>/dev/null | grep 'PID:' | awk '{print $2}' | sort -u) + if [ -n "$kfd_pids" ]; then + echo "Processes accessing /dev/kfd (AMD GPU device):" + for pid in $kfd_pids; do + if ps -p $pid -o pid,ppid,cmd --no-headers 2>/dev/null; then + echo " └─ Command line: $(ps -p $pid -o cmd --no-headers 2>/dev/null | head -1)" + else + echo " └─ PID $pid: Process not found or already terminated" + fi + done + else + echo "No processes found accessing /dev/kfd" + fi + fi + + # Check for any remaining sglang-related processes + echo "Checking for any remaining sglang-related processes:" + sglang_procs=$(pgrep -f 'sglang::|sglang\.launch_server|sglang\.bench|sglang\.data_parallel|sglang\.srt' 2>/dev/null) + if [ -n "$sglang_procs" ]; then + echo "Found sglang processes still running:" + for pid in $sglang_procs; do + ps -p $pid -o pid,ppid,cmd --no-headers 2>/dev/null || echo "PID $pid not found" + done + else + echo "No sglang-related processes found." 
+ fi + + echo "==================================================================" + return 1 +} + +# If this script is run directly (not sourced), run the ensure function +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + set -e + ensure_vram_clear "$@" +fi diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index b0cfd44bf..5904e4ab1 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -2,6 +2,7 @@ import argparse import glob from dataclasses import dataclass +from sglang.srt.utils import is_hip from sglang.test.test_utils import run_unittest_files @@ -368,7 +369,7 @@ if __name__ == "__main__": arg_parser.add_argument( "--timeout-per-file", type=int, - default=1200, + default=1500 if is_hip() else 1200, help="The time limit for running one file in seconds.", ) arg_parser.add_argument(