From 1d0865397248e47d5888f489d56b73785710c08f Mon Sep 17 00:00:00 2001 From: Sai Enduri Date: Tue, 14 Oct 2025 02:51:35 -0700 Subject: [PATCH] [AMD CI] Add image and weights caching. (#11593) --- .github/workflows/pr-test-amd.yml | 18 +++++++++--------- scripts/ci/amd_ci_start_container.sh | 22 ++++++++++++++++++++++ 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index 3b58cde5d..b74f5f2f3 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -30,7 +30,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi325-gpu-1] + runner: [linux-mi300-gpu-1] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -59,7 +59,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi325-gpu-2] + runner: [linux-mi300-gpu-2] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -86,7 +86,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi325-gpu-1] + runner: [linux-mi300-gpu-1] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -113,7 +113,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi325-gpu-1] + runner: [linux-mi300-gpu-1] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -156,7 +156,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi325-gpu-1] + runner: [linux-mi300-gpu-1] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -193,7 +193,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi325-gpu-2] + runner: [linux-mi300-gpu-2] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -240,7 +240,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi325-gpu-1] + runner: [linux-mi300-gpu-1] part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] runs-on: ${{matrix.runner}} steps: @@ -268,7 +268,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi325-gpu-2] + runner: [linux-mi300-gpu-2] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -323,7 +323,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi325-gpu-1] + runner: [linux-mi300-gpu-1] runs-on: ${{matrix.runner}} steps: - name: Checkout code diff --git a/scripts/ci/amd_ci_start_container.sh b/scripts/ci/amd_ci_start_container.sh index a1f281c8d..e435f3321 100755 --- a/scripts/ci/amd_ci_start_container.sh +++ b/scripts/ci/amd_ci_start_container.sh @@ -95,6 +95,19 @@ find_latest_image() { *) echo "Error: unsupported GPU architecture '${gpu_arch}'" >&2; return 1 ;; esac + # First, check local cache + for days_back in {0..6}; do + image_tag="${base_tag}-$(date -d "${days_back} days ago" +%Y%m%d)" + local local_image="rocm/sgl-dev:${image_tag}" + image_id=$(docker images -q "${local_image}") + if [[ -n "$image_id" ]]; then + echo "Found cached image locally: ${local_image}" >&2 + echo "${local_image}" + return 0 + fi + done + + # If not found locally, fall back to pulling from public registry for days_back in {0..6}; do image_tag="${base_tag}-$(date -d "${days_back} days ago" +%Y%m%d)" echo "Checking for image: rocm/sgl-dev:${image_tag}" >&2 @@ -119,13 +132,22 @@ IMAGE=$(find_latest_image "${GPU_ARCH}") echo "Pulling Docker image: ${IMAGE}" docker pull "${IMAGE}" +HF_CACHE_HOST=/home/runner/sgl-data/hf-cache +if [[ -d "$HF_CACHE_HOST" ]]; then + CACHE_VOLUME="-v $HF_CACHE_HOST:/hf_home" +else + CACHE_VOLUME="" +fi + echo "Launching container: ci_sglang" docker run -dt --user root --device=/dev/kfd ${DEVICE_FLAG} \ -v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout" \ + $CACHE_VOLUME \ --ipc=host --group-add video \ --shm-size 32g \ --cap-add=SYS_PTRACE \ -e HF_TOKEN="${HF_TOKEN:-}" \ + -e HF_HOME=/hf_home \ --security-opt seccomp=unconfined \ -w /sglang-checkout \ --name ci_sglang \