diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index 3efa5c2f1..acc8a6bb9 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -36,6 +36,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: @@ -62,6 +65,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: @@ -86,6 +92,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: @@ -110,6 +119,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: @@ -150,6 +162,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: @@ -184,6 +199,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: @@ -229,6 +247,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: @@ -253,6 +274,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: 
bash scripts/ci/amd_ci_start_container.sh env: @@ -277,6 +301,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: @@ -301,6 +328,9 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + - name: Start CI container run: bash scripts/ci/amd_ci_start_container.sh env: diff --git a/scripts/check_vram_clear.sh b/scripts/check_vram_clear.sh new file mode 100755 index 000000000..51e5a915f --- /dev/null +++ b/scripts/check_vram_clear.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +check_vram_clear() { + local vram_threshold_percent=5 # Allow up to 5% VRAM usage + local memory_threshold_mb=500 # Allow up to 500MB memory usage + + if command -v rocm-smi >/dev/null 2>&1; then + echo "Checking ROCm GPU VRAM usage..." + # Check if any GPU has more than threshold VRAM allocated + local high_usage=$(rocm-smi --showmemuse | grep -E "GPU Memory Allocated \(VRAM%\): ([6-9]|[1-9][0-9]|100)") + if [ -n "$high_usage" ]; then + echo "ERROR: VRAM usage exceeds threshold (${vram_threshold_percent}%) on some GPUs:" + echo "$high_usage" + rocm-smi --showmemuse + return 1 + else + echo "✓ VRAM usage is within acceptable limits on all GPUs" + return 0 + fi + fi +} + +# If this script is run directly (not sourced), run the check +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + set -e + check_vram_clear +fi diff --git a/scripts/ensure_vram_clear.sh b/scripts/ensure_vram_clear.sh new file mode 100755 index 000000000..0dd720960 --- /dev/null +++ b/scripts/ensure_vram_clear.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +# Source the VRAM checking function +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/check_vram_clear.sh" + +ensure_vram_clear() { + local max_retries=3 + local retry_count=0 + + # Stop and remove any existing ci_sglang container + 
echo "Stopping any existing ci_sglang container..." + docker stop ci_sglang || true + docker rm ci_sglang || true + + # Log host information for debugging + echo "=== Host Information ===" + echo "Hostname: $(hostname)" + echo "Host IP: $(hostname -I 2>/dev/null || echo 'N/A')" + echo "Date: $(date)" + echo "Mode: rocm" + echo "========================" + echo "Running in ROCm mode" + + # Show initial GPU status + echo "=== Initial GPU Memory Status ===" + rocm-smi --showmemuse + echo "==================================" + + while [ $retry_count -lt $max_retries ]; do + echo "=== Cleanup Attempt $((retry_count + 1))/$max_retries ===" + + # Clean SGLang processes + echo "Killing SGLang processes..." + pgrep -f 'sglang::|sglang\.launch_server|sglang\.bench|sglang\.data_parallel|sglang\.srt' | xargs -r kill -9 || true + + if [ $retry_count -gt 0 ]; then + echo "Performing aggressive cleanup..." + # Kill all processes using KFD + rocm-smi --showpids 2>/dev/null | grep 'PID:' | awk '{print $2}' | xargs -r kill -9 2>/dev/null || true + # Wait a bit for cleanup to take effect + echo "Waiting 30 seconds for VRAM to clear..." + sleep 30 + fi + + # Check VRAM + echo "Checking VRAM status..." 
+ if check_vram_clear; then + echo "✓ VRAM cleanup successful after $((retry_count + 1)) attempts" + return 0 + else + echo "✗ VRAM still not clear after attempt $((retry_count + 1))" + retry_count=$((retry_count + 1)) + fi + done + + # Failed after all retries + echo "=== FAILED: VRAM cleanup unsuccessful after $max_retries attempts ===" + echo "Final GPU status:" + timeout 30 rocm-smi --showmemuse || echo "rocm-smi timed out" + echo "Processes using GPU:" + rocm-smi --showpids 2>/dev/null | grep 'PID:' || echo "No processes found using /dev/kfd" + + # Print detailed information about suspicious processes + echo "=== Detailed Process Information ===" + if command -v rocm-smi >/dev/null 2>&1; then + # For AMD GPUs, get processes from rocm-smi --showpids + kfd_pids=$(rocm-smi --showpids 2>/dev/null | grep 'PID:' | awk '{print $2}' | sort -u) + if [ -n "$kfd_pids" ]; then + echo "Processes accessing /dev/kfd (AMD GPU device):" + for pid in $kfd_pids; do + if ps -p $pid -o pid,ppid,cmd --no-headers 2>/dev/null; then + echo " └─ Command line: $(ps -p $pid -o cmd --no-headers 2>/dev/null | head -1)" + else + echo " └─ PID $pid: Process not found or already terminated" + fi + done + else + echo "No processes found accessing /dev/kfd" + fi + fi + + # Check for any remaining sglang-related processes + echo "Checking for any remaining sglang-related processes:" + sglang_procs=$(pgrep -f 'sglang::|sglang\.launch_server|sglang\.bench|sglang\.data_parallel|sglang\.srt' 2>/dev/null) + if [ -n "$sglang_procs" ]; then + echo "Found sglang processes still running:" + for pid in $sglang_procs; do + ps -p $pid -o pid,ppid,cmd --no-headers 2>/dev/null || echo "PID $pid not found" + done + else + echo "No sglang-related processes found." 
+ fi + + echo "==================================================================" + return 1 +} + +# If this script is run directly (not sourced), run the ensure function +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + set -e + ensure_vram_clear "$@" +fi diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index b0cfd44bf..5904e4ab1 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -2,6 +2,7 @@ import argparse import glob from dataclasses import dataclass +from sglang.srt.utils import is_hip from sglang.test.test_utils import run_unittest_files @@ -368,7 +369,7 @@ if __name__ == "__main__": arg_parser.add_argument( "--timeout-per-file", type=int, - default=1200, + default=1500 if is_hip() else 1200, help="The time limit for running one file in seconds.", ) arg_parser.add_argument(