[AMD CI] Add image and weights caching. (#11593)
This commit is contained in:
18
.github/workflows/pr-test-amd.yml
vendored
18
.github/workflows/pr-test-amd.yml
vendored
@@ -30,7 +30,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
runner: [linux-mi325-gpu-1]
|
runner: [linux-mi300-gpu-1]
|
||||||
runs-on: ${{matrix.runner}}
|
runs-on: ${{matrix.runner}}
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
@@ -59,7 +59,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
runner: [linux-mi325-gpu-2]
|
runner: [linux-mi300-gpu-2]
|
||||||
runs-on: ${{matrix.runner}}
|
runs-on: ${{matrix.runner}}
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
@@ -86,7 +86,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
runner: [linux-mi325-gpu-1]
|
runner: [linux-mi300-gpu-1]
|
||||||
runs-on: ${{matrix.runner}}
|
runs-on: ${{matrix.runner}}
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
@@ -113,7 +113,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
runner: [linux-mi325-gpu-1]
|
runner: [linux-mi300-gpu-1]
|
||||||
runs-on: ${{matrix.runner}}
|
runs-on: ${{matrix.runner}}
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
@@ -156,7 +156,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
runner: [linux-mi325-gpu-1]
|
runner: [linux-mi300-gpu-1]
|
||||||
runs-on: ${{matrix.runner}}
|
runs-on: ${{matrix.runner}}
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
@@ -193,7 +193,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
runner: [linux-mi325-gpu-2]
|
runner: [linux-mi300-gpu-2]
|
||||||
runs-on: ${{matrix.runner}}
|
runs-on: ${{matrix.runner}}
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
@@ -240,7 +240,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
runner: [linux-mi325-gpu-1]
|
runner: [linux-mi300-gpu-1]
|
||||||
part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
|
part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
|
||||||
runs-on: ${{matrix.runner}}
|
runs-on: ${{matrix.runner}}
|
||||||
steps:
|
steps:
|
||||||
@@ -268,7 +268,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
runner: [linux-mi325-gpu-2]
|
runner: [linux-mi300-gpu-2]
|
||||||
runs-on: ${{matrix.runner}}
|
runs-on: ${{matrix.runner}}
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
@@ -323,7 +323,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
runner: [linux-mi325-gpu-1]
|
runner: [linux-mi300-gpu-1]
|
||||||
runs-on: ${{matrix.runner}}
|
runs-on: ${{matrix.runner}}
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
|
|||||||
@@ -95,6 +95,19 @@ find_latest_image() {
|
|||||||
*) echo "Error: unsupported GPU architecture '${gpu_arch}'" >&2; return 1 ;;
|
*) echo "Error: unsupported GPU architecture '${gpu_arch}'" >&2; return 1 ;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
# First, check local cache
|
||||||
|
for days_back in {0..6}; do
|
||||||
|
image_tag="${base_tag}-$(date -d "${days_back} days ago" +%Y%m%d)"
|
||||||
|
local local_image="rocm/sgl-dev:${image_tag}"
|
||||||
|
image_id=$(docker images -q "${local_image}")
|
||||||
|
if [[ -n "$image_id" ]]; then
|
||||||
|
echo "Found cached image locally: ${local_image}" >&2
|
||||||
|
echo "${local_image}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# If not found locally, fall back to pulling from public registry
|
||||||
for days_back in {0..6}; do
|
for days_back in {0..6}; do
|
||||||
image_tag="${base_tag}-$(date -d "${days_back} days ago" +%Y%m%d)"
|
image_tag="${base_tag}-$(date -d "${days_back} days ago" +%Y%m%d)"
|
||||||
echo "Checking for image: rocm/sgl-dev:${image_tag}" >&2
|
echo "Checking for image: rocm/sgl-dev:${image_tag}" >&2
|
||||||
@@ -119,13 +132,22 @@ IMAGE=$(find_latest_image "${GPU_ARCH}")
|
|||||||
echo "Pulling Docker image: ${IMAGE}"
|
echo "Pulling Docker image: ${IMAGE}"
|
||||||
docker pull "${IMAGE}"
|
docker pull "${IMAGE}"
|
||||||
|
|
||||||
|
HF_CACHE_HOST=/home/runner/sgl-data/hf-cache
|
||||||
|
if [[ -d "$HF_CACHE_HOST" ]]; then
|
||||||
|
CACHE_VOLUME="-v $HF_CACHE_HOST:/hf_home"
|
||||||
|
else
|
||||||
|
CACHE_VOLUME=""
|
||||||
|
fi
|
||||||
|
|
||||||
echo "Launching container: ci_sglang"
|
echo "Launching container: ci_sglang"
|
||||||
docker run -dt --user root --device=/dev/kfd ${DEVICE_FLAG} \
|
docker run -dt --user root --device=/dev/kfd ${DEVICE_FLAG} \
|
||||||
-v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout" \
|
-v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout" \
|
||||||
|
$CACHE_VOLUME \
|
||||||
--ipc=host --group-add video \
|
--ipc=host --group-add video \
|
||||||
--shm-size 32g \
|
--shm-size 32g \
|
||||||
--cap-add=SYS_PTRACE \
|
--cap-add=SYS_PTRACE \
|
||||||
-e HF_TOKEN="${HF_TOKEN:-}" \
|
-e HF_TOKEN="${HF_TOKEN:-}" \
|
||||||
|
-e HF_HOME=/hf_home \
|
||||||
--security-opt seccomp=unconfined \
|
--security-opt seccomp=unconfined \
|
||||||
-w /sglang-checkout \
|
-w /sglang-checkout \
|
||||||
--name ci_sglang \
|
--name ci_sglang \
|
||||||
|
|||||||
Reference in New Issue
Block a user