Files
sglang/scripts/ensure_vram_clear.sh
2025-10-05 19:03:53 -04:00

104 lines
3.8 KiB
Bash
Executable File

#!/bin/bash
# Work out the absolute directory that contains this file (whether it is
# executed or sourced) and load check_vram_clear.sh from that same directory.
# ensure_vram_clear below calls check_vram_clear, which is expected to come
# from that helper file.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
. "$SCRIPT_DIR/check_vram_clear.sh"
#######################################
# Free GPU memory on a ROCm host before a CI run.
#
# Stops and removes the ci_sglang container, logs host details, then makes up
# to three cleanup attempts. Every attempt SIGKILLs processes whose command
# line matches the SGLang patterns; attempts after the first additionally
# SIGKILL every PID reported by `rocm-smi --showpids` and wait 30 seconds.
# After each attempt check_vram_clear decides whether VRAM is clear. When all
# attempts fail, a diagnostic report of remaining GPU/SGLang processes is
# printed.
#
# Globals:   none (all working variables are local); check_vram_clear must
#            already be defined by the caller's environment.
# Arguments: none (any arguments passed are ignored)
# Outputs:   progress and diagnostics on stdout
# Returns:   0 as soon as check_vram_clear succeeds, 1 if every attempt fails
#######################################
ensure_vram_clear() {
  local max_retries=3
  local retry_count=0
  # Single definition of the pgrep -f pattern used for both killing and
  # reporting, so the two can never drift apart.
  local sglang_pattern='sglang::|sglang\.launch_server|sglang\.bench|sglang\.data_parallel|sglang\.srt'
  # Kept local so nothing leaks into the caller's shell when this file is sourced.
  local kfd_pids sglang_procs pid

  # Stop and remove any existing ci_sglang container (best effort: the
  # container may simply not exist).
  echo "Stopping any existing ci_sglang container..."
  docker stop ci_sglang || true
  docker rm ci_sglang || true

  # Log host information for debugging
  echo "=== Host Information ==="
  echo "Hostname: $(hostname)"
  echo "Host IP: $(hostname -I 2>/dev/null || echo 'N/A')"
  echo "Date: $(date)"
  echo "Mode: rocm"
  echo "========================"
  echo "Running in ROCm mode"

  # Show initial GPU status
  echo "=== Initial GPU Memory Status ==="
  rocm-smi --showmemuse
  echo "=================================="

  while (( retry_count < max_retries )); do
    echo "=== Cleanup Attempt $((retry_count + 1))/$max_retries ==="

    # Clean SGLang processes. pgrep exits non-zero when nothing matches, so
    # the pipeline is allowed to fail.
    echo "Killing SGLang processes..."
    pgrep -f "$sglang_pattern" | xargs -r kill -9 || true

    # Only escalate after the targeted kill above has already failed once.
    if (( retry_count > 0 )); then
      echo "Performing aggressive cleanup..."
      # Kill all processes using KFD.
      # NOTE(review): assumes `rocm-smi --showpids` prints lines of the form
      # "PID: <n> ..." — confirm against the installed ROCm version.
      rocm-smi --showpids 2>/dev/null | grep 'PID:' | awk '{print $2}' | xargs -r kill -9 2>/dev/null || true
      # Wait a bit for cleanup to take effect
      echo "Waiting 30 seconds for VRAM to clear..."
      sleep 30
    fi

    # Check VRAM
    echo "Checking VRAM status..."
    if check_vram_clear; then
      echo "✓ VRAM cleanup successful after $((retry_count + 1)) attempts"
      return 0
    fi
    echo "✗ VRAM still not clear after attempt $((retry_count + 1))"
    retry_count=$((retry_count + 1))
  done

  # Failed after all retries
  echo "=== FAILED: VRAM cleanup unsuccessful after $max_retries attempts ==="
  echo "Final GPU status:"
  timeout 30 rocm-smi --showmemuse || echo "rocm-smi timed out"

  echo "Processes using GPU:"
  # Print the matching lines themselves (no -q), so the header above is
  # actually followed by the process list; fall back to a notice otherwise.
  rocm-smi --showpids 2>/dev/null | grep 'PID:' || echo "No processes found using /dev/kfd"

  # Print detailed information about suspicious processes
  echo "=== Detailed Process Information ==="
  if command -v rocm-smi >/dev/null 2>&1; then
    # For AMD GPUs, get processes from rocm-smi --showpids
    kfd_pids=$(rocm-smi --showpids 2>/dev/null | grep 'PID:' | awk '{print $2}' | sort -u) || true
    if [[ -n "$kfd_pids" ]]; then
      echo "Processes accessing /dev/kfd (AMD GPU device):"
      # Unquoted on purpose: one PID per whitespace-separated word.
      for pid in $kfd_pids; do
        if ps -p "$pid" -o pid,ppid,cmd --no-headers 2>/dev/null; then
          echo " └─ Command line: $(ps -p "$pid" -o cmd --no-headers 2>/dev/null | head -1)"
        else
          echo " └─ PID $pid: Process not found or already terminated"
        fi
      done
    else
      echo "No processes found accessing /dev/kfd"
    fi
  fi

  # Check for any remaining sglang-related processes.
  # pgrep exits 1 when nothing matches; without `|| true` a bare assignment
  # would abort the whole script under `set -e` before the report finishes.
  echo "Checking for any remaining sglang-related processes:"
  sglang_procs=$(pgrep -f "$sglang_pattern" 2>/dev/null) || true
  if [[ -n "$sglang_procs" ]]; then
    echo "Found sglang processes still running:"
    for pid in $sglang_procs; do
      ps -p "$pid" -o pid,ppid,cmd --no-headers 2>/dev/null || echo "PID $pid not found"
    done
  else
    echo "No sglang-related processes found."
  fi
  echo "=================================================================="
  return 1
}
# Entry point: when this file is executed rather than sourced, $0 names the
# same file as BASH_SOURCE[0]. In that case turn on errexit and run the
# cleanup, forwarding all command-line arguments; the function's return
# status becomes the script's exit status.
if [[ "$0" == "${BASH_SOURCE[0]}" ]]; then
  set -o errexit
  ensure_vram_clear "$@"
fi