[Fix AMD CI] VRAM cleanup (#11174)
Co-authored-by: root <root@smci350-zts-gtu-e17-15.zts-gtu.dcgpu>
This commit is contained in:
30
.github/workflows/pr-test-amd.yml
vendored
30
.github/workflows/pr-test-amd.yml
vendored
@@ -36,6 +36,9 @@ jobs:
|
|||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Ensure VRAM is clear
|
||||||
|
run: bash scripts/ensure_vram_clear.sh rocm
|
||||||
|
|
||||||
- name: Start CI container
|
- name: Start CI container
|
||||||
run: bash scripts/ci/amd_ci_start_container.sh
|
run: bash scripts/ci/amd_ci_start_container.sh
|
||||||
env:
|
env:
|
||||||
@@ -62,6 +65,9 @@ jobs:
|
|||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Ensure VRAM is clear
|
||||||
|
run: bash scripts/ensure_vram_clear.sh rocm
|
||||||
|
|
||||||
- name: Start CI container
|
- name: Start CI container
|
||||||
run: bash scripts/ci/amd_ci_start_container.sh
|
run: bash scripts/ci/amd_ci_start_container.sh
|
||||||
env:
|
env:
|
||||||
@@ -86,6 +92,9 @@ jobs:
|
|||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Ensure VRAM is clear
|
||||||
|
run: bash scripts/ensure_vram_clear.sh rocm
|
||||||
|
|
||||||
- name: Start CI container
|
- name: Start CI container
|
||||||
run: bash scripts/ci/amd_ci_start_container.sh
|
run: bash scripts/ci/amd_ci_start_container.sh
|
||||||
env:
|
env:
|
||||||
@@ -110,6 +119,9 @@ jobs:
|
|||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Ensure VRAM is clear
|
||||||
|
run: bash scripts/ensure_vram_clear.sh rocm
|
||||||
|
|
||||||
- name: Start CI container
|
- name: Start CI container
|
||||||
run: bash scripts/ci/amd_ci_start_container.sh
|
run: bash scripts/ci/amd_ci_start_container.sh
|
||||||
env:
|
env:
|
||||||
@@ -150,6 +162,9 @@ jobs:
|
|||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Ensure VRAM is clear
|
||||||
|
run: bash scripts/ensure_vram_clear.sh rocm
|
||||||
|
|
||||||
- name: Start CI container
|
- name: Start CI container
|
||||||
run: bash scripts/ci/amd_ci_start_container.sh
|
run: bash scripts/ci/amd_ci_start_container.sh
|
||||||
env:
|
env:
|
||||||
@@ -184,6 +199,9 @@ jobs:
|
|||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Ensure VRAM is clear
|
||||||
|
run: bash scripts/ensure_vram_clear.sh rocm
|
||||||
|
|
||||||
- name: Start CI container
|
- name: Start CI container
|
||||||
run: bash scripts/ci/amd_ci_start_container.sh
|
run: bash scripts/ci/amd_ci_start_container.sh
|
||||||
env:
|
env:
|
||||||
@@ -229,6 +247,9 @@ jobs:
|
|||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Ensure VRAM is clear
|
||||||
|
run: bash scripts/ensure_vram_clear.sh rocm
|
||||||
|
|
||||||
- name: Start CI container
|
- name: Start CI container
|
||||||
run: bash scripts/ci/amd_ci_start_container.sh
|
run: bash scripts/ci/amd_ci_start_container.sh
|
||||||
env:
|
env:
|
||||||
@@ -253,6 +274,9 @@ jobs:
|
|||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Ensure VRAM is clear
|
||||||
|
run: bash scripts/ensure_vram_clear.sh rocm
|
||||||
|
|
||||||
- name: Start CI container
|
- name: Start CI container
|
||||||
run: bash scripts/ci/amd_ci_start_container.sh
|
run: bash scripts/ci/amd_ci_start_container.sh
|
||||||
env:
|
env:
|
||||||
@@ -277,6 +301,9 @@ jobs:
|
|||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Ensure VRAM is clear
|
||||||
|
run: bash scripts/ensure_vram_clear.sh rocm
|
||||||
|
|
||||||
- name: Start CI container
|
- name: Start CI container
|
||||||
run: bash scripts/ci/amd_ci_start_container.sh
|
run: bash scripts/ci/amd_ci_start_container.sh
|
||||||
env:
|
env:
|
||||||
@@ -301,6 +328,9 @@ jobs:
|
|||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Ensure VRAM is clear
|
||||||
|
run: bash scripts/ensure_vram_clear.sh rocm
|
||||||
|
|
||||||
- name: Start CI container
|
- name: Start CI container
|
||||||
run: bash scripts/ci/amd_ci_start_container.sh
|
run: bash scripts/ci/amd_ci_start_container.sh
|
||||||
env:
|
env:
|
||||||
|
|||||||
27
scripts/check_vram_clear.sh
Executable file
27
scripts/check_vram_clear.sh
Executable file
@@ -0,0 +1,27 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
#######################################
# Verify that no ROCm GPU has more VRAM allocated than the allowed percentage.
#
# The percentages are read from the "GPU Memory Allocated (VRAM%): N" lines
# printed by `rocm-smi --showmemuse` and compared numerically against the
# threshold.
#
# Arguments:
#   $1 - optional maximum allowed VRAM usage in percent
#        (non-negative integer, default 5)
# Outputs:
#   Progress and result messages on stdout; warnings and usage errors on
#   stderr.
# Returns:
#   0 if every reported GPU is at or below the threshold, or if rocm-smi is
#     not installed (nothing to check)
#   1 if any GPU exceeds the threshold, or no VRAM usage line could be read
#   2 if the threshold argument is not a non-negative integer
#######################################
check_vram_clear() {
    local vram_threshold_percent="${1:-5}"  # Allow up to 5% VRAM usage by default

    if [[ ! "$vram_threshold_percent" =~ ^[0-9]+$ ]]; then
        echo "ERROR: invalid VRAM threshold '${vram_threshold_percent}' (expected a non-negative integer)" >&2
        return 2
    fi

    if ! command -v rocm-smi >/dev/null 2>&1; then
        # Without rocm-smi there is nothing to inspect; keep this a no-op success.
        echo "WARNING: rocm-smi not found; skipping VRAM check" >&2
        return 0
    fi

    echo "Checking ROCm GPU VRAM usage..."

    # Capture the report once. The exit status is deliberately not trusted on
    # its own: whether usable VRAM lines were produced is decided below.
    # `|| true` keeps callers running under `set -e` alive.
    local mem_report
    mem_report=$(rocm-smi --showmemuse) || true

    # awk prints every line whose percentage is above the limit and reports
    # through its exit status: 0 = all fine, 1 = some GPU over the limit,
    # 2 = no VRAM usage line found at all.
    local high_usage
    local status=0
    high_usage=$(awk -v limit="$vram_threshold_percent" '
        BEGIN { label = "GPU Memory Allocated (VRAM%):" }
        {
            pos = index($0, label)
            if (pos == 0) next
            rest = substr($0, pos + length(label))
            if (match(rest, /^[ \t]*[0-9]+/) == 0) next
            seen++
            if (substr(rest, RSTART, RLENGTH) + 0 > limit + 0) {
                print
                over++
            }
        }
        END {
            if (seen == 0) exit 2
            if (over > 0) exit 1
        }
    ' <<<"$mem_report") || status=$?

    case "$status" in
        0)
            echo "✓ VRAM usage is within acceptable limits on all GPUs"
            return 0
            ;;
        1)
            echo "ERROR: VRAM usage exceeds threshold (${vram_threshold_percent}%) on some GPUs:"
            echo "$high_usage"
            # Full report for context (already captured above).
            echo "$mem_report"
            return 1
            ;;
        *)
            # An unreadable report must not be mistaken for "VRAM is clear".
            echo "ERROR: could not read VRAM usage from 'rocm-smi --showmemuse'" >&2
            echo "$mem_report"
            return 1
            ;;
    esac
}
|
||||||
|
|
||||||
|
# Entry point: only when this file is executed as a script (its own path is
# $0) rather than sourced, enable exit-on-error and run the VRAM check so the
# script's exit status reflects the result.
if [[ "${0}" == "${BASH_SOURCE[0]}" ]]; then
    set -o errexit
    check_vram_clear
fi
|
||||||
103
scripts/ensure_vram_clear.sh
Executable file
103
scripts/ensure_vram_clear.sh
Executable file
@@ -0,0 +1,103 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Resolve the directory containing this script so the helper below is found
# regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Load check_vram_clear(). Stop right away if the helper cannot be loaded:
# otherwise every later call to check_vram_clear would fail as "command not
# found" and be treated as "VRAM not clear".
# shellcheck source=scripts/check_vram_clear.sh
if ! source "$SCRIPT_DIR/check_vram_clear.sh"; then
    echo "ERROR: failed to source $SCRIPT_DIR/check_vram_clear.sh" >&2
    # `return` succeeds when this file is itself being sourced; when it is
    # executed directly `return` fails and we fall through to `exit`.
    return 1 2>/dev/null || exit 1
fi
|
||||||
|
|
||||||
|
#######################################
# Print the unique PIDs found on 'PID:' lines of `rocm-smi --showpids`,
# one per line. Non-numeric tokens are dropped so they never reach `kill`.
# Outputs:
#   PIDs on stdout (possibly nothing).
#######################################
_ensure_vram_kfd_pids() {
    rocm-smi --showpids 2>/dev/null \
        | awk '/PID:/ && $2 ~ /^[0-9]+$/ { print $2 }' \
        | sort -u
}

#######################################
# Free GPU VRAM before a CI run and verify the result.
#
# Stops and removes the `ci_sglang` container, then repeatedly kills SGLang
# processes and calls check_vram_clear. From the second attempt on it also
# kills every PID reported by `rocm-smi --showpids` and waits before
# re-checking. If VRAM is still not clear after the last attempt, process
# diagnostics are printed.
#
# Globals:
#   VRAM_CLEAR_MAX_RETRIES (read, optional) - number of attempts, default 3
#   VRAM_CLEAR_RETRY_DELAY (read, optional) - seconds to wait after the
#                                             aggressive cleanup, default 30
# Arguments:
#   Accepted but not read; the function always runs in ROCm mode.
# Outputs:
#   Progress and diagnostics on stdout.
# Returns:
#   0 as soon as check_vram_clear succeeds, 1 after all attempts failed.
#######################################
ensure_vram_clear() {
    local max_retries="${VRAM_CLEAR_MAX_RETRIES:-3}"
    local retry_delay="${VRAM_CLEAR_RETRY_DELAY:-30}"
    local retry_count=0
    local sglang_pattern='sglang::|sglang\.launch_server|sglang\.bench|sglang\.data_parallel|sglang\.srt'
    local kfd_pids sglang_procs pid

    # Fall back to the defaults on malformed overrides.
    [[ "$max_retries" =~ ^[1-9][0-9]*$ ]] || max_retries=3
    [[ "$retry_delay" =~ ^[0-9]+$ ]] || retry_delay=30

    # Stop and remove any existing ci_sglang container
    echo "Stopping any existing ci_sglang container..."
    docker stop ci_sglang || true
    docker rm ci_sglang || true

    # Log host information for debugging
    echo "=== Host Information ==="
    echo "Hostname: $(hostname)"
    echo "Host IP: $(hostname -I 2>/dev/null || echo 'N/A')"
    echo "Date: $(date)"
    echo "Mode: rocm"
    echo "========================"
    echo "Running in ROCm mode"

    # Show initial GPU status. This is informational only, so a failing or
    # hanging rocm-smi must not abort the script before cleanup is attempted.
    echo "=== Initial GPU Memory Status ==="
    timeout 30 rocm-smi --showmemuse || echo "rocm-smi failed or timed out"
    echo "=================================="

    while (( retry_count < max_retries )); do
        echo "=== Cleanup Attempt $((retry_count + 1))/$max_retries ==="

        # Clean SGLang processes (best effort: pgrep exits non-zero when
        # nothing matches).
        echo "Killing SGLang processes..."
        pgrep -f "$sglang_pattern" | xargs -r kill -9 || true

        if (( retry_count > 0 )); then
            echo "Performing aggressive cleanup..."
            # Kill all processes using KFD
            _ensure_vram_kfd_pids | xargs -r kill -9 2>/dev/null || true
            # Wait a bit for cleanup to take effect
            echo "Waiting ${retry_delay} seconds for VRAM to clear..."
            sleep "$retry_delay"
        fi

        # Check VRAM
        echo "Checking VRAM status..."
        if check_vram_clear; then
            echo "✓ VRAM cleanup successful after $((retry_count + 1)) attempts"
            return 0
        fi
        echo "✗ VRAM still not clear after attempt $((retry_count + 1))"
        retry_count=$((retry_count + 1))
    done

    # Failed after all retries
    echo "=== FAILED: VRAM cleanup unsuccessful after $max_retries attempts ==="
    echo "Final GPU status:"
    timeout 30 rocm-smi --showmemuse || echo "rocm-smi failed or timed out"
    echo "Processes using GPU:"
    # Print the matching lines themselves so the header is followed by data.
    rocm-smi --showpids 2>/dev/null | grep 'PID:' || echo "No processes found using /dev/kfd"

    # Print detailed information about suspicious processes
    echo "=== Detailed Process Information ==="
    if command -v rocm-smi >/dev/null 2>&1; then
        # For AMD GPUs, get processes from rocm-smi --showpids
        kfd_pids=$(_ensure_vram_kfd_pids) || true
        if [[ -n "$kfd_pids" ]]; then
            echo "Processes accessing /dev/kfd (AMD GPU device):"
            # Word splitting is intended: one numeric PID per word.
            for pid in $kfd_pids; do
                if ps -p "$pid" -o pid,ppid,cmd --no-headers 2>/dev/null; then
                    echo "  └─ Command line: $(ps -p "$pid" -o cmd --no-headers 2>/dev/null | head -1)"
                else
                    echo "  └─ PID $pid: Process not found or already terminated"
                fi
            done
        else
            echo "No processes found accessing /dev/kfd"
        fi
    fi

    # Check for any remaining sglang-related processes.
    # `|| true`: pgrep exits 1 when nothing matches, which would otherwise
    # terminate a `set -e` caller here and cut the diagnostics short.
    echo "Checking for any remaining sglang-related processes:"
    sglang_procs=$(pgrep -f "$sglang_pattern" 2>/dev/null) || true
    if [[ -n "$sglang_procs" ]]; then
        echo "Found sglang processes still running:"
        for pid in $sglang_procs; do
            ps -p "$pid" -o pid,ppid,cmd --no-headers 2>/dev/null || echo "PID $pid not found"
        done
    else
        echo "No sglang-related processes found."
    fi

    echo "=================================================================="
    return 1
}
|
||||||
|
|
||||||
|
# Entry point: only when this file is executed as a script (its own path is
# $0) rather than sourced, enable exit-on-error and run the cleanup, passing
# the command-line arguments through unchanged.
if [[ "${0}" == "${BASH_SOURCE[0]}" ]]; then
    set -o errexit
    ensure_vram_clear "$@"
fi
|
||||||
@@ -2,6 +2,7 @@ import argparse
|
|||||||
import glob
|
import glob
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from sglang.srt.utils import is_hip
|
||||||
from sglang.test.test_utils import run_unittest_files
|
from sglang.test.test_utils import run_unittest_files
|
||||||
|
|
||||||
|
|
||||||
@@ -368,7 +369,7 @@ if __name__ == "__main__":
|
|||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
"--timeout-per-file",
|
"--timeout-per-file",
|
||||||
type=int,
|
type=int,
|
||||||
default=1200,
|
default=1500 if is_hip() else 1200,
|
||||||
help="The time limit for running one file in seconds.",
|
help="The time limit for running one file in seconds.",
|
||||||
)
|
)
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
|
|||||||
Reference in New Issue
Block a user