Add tests to AMD CI for MI35x (#9662)

Co-authored-by: Sai Enduri <saimanas.enduri@amd.com>
This commit is contained in:
Hubert Lu
2025-09-10 12:50:05 -07:00
committed by GitHub
parent 9e2f7252db
commit 91b3555d2d
7 changed files with 159 additions and 122 deletions

View File

@@ -1,6 +1,18 @@
#!/bin/bash
set -euo pipefail
# Detect GPU family from hostname (e.g., linux-mi35x-gpu-1-xxxxx-runner-zzzzz)
HOSTNAME_VALUE=$(hostname)
GPU_FAMILY=""
# Host names look like: linux-mi35x-gpu-1-xxxxx-runner-zzzzz
if [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
GPU_FAMILY="${BASH_REMATCH[1]}"
echo "Detected GPU family from hostname: ${GPU_FAMILY}"
else
echo "Warning: could not parse GPU family from '${HOSTNAME_VALUE}'"
fi
WORKDIR="/sglang-checkout/test/srt"
declare -A ENV_MAP=(
[SGLANG_AMD_CI]=1
@@ -8,6 +20,11 @@ declare -A ENV_MAP=(
[SGLANG_USE_AITER]=1
)
# Conditionally add GPU_ARCHS only for mi35x
if [[ "${GPU_FAMILY}" == "mi35x" ]]; then
ENV_MAP[GPU_ARCHS]="gfx950"
fi
# Parse -w/--workdir and -e ENV=VAL
while [[ $# -gt 0 ]]; do
case "$1" in

View File

@@ -1,19 +1,44 @@
#!/bin/bash
set -euo pipefail
HOSTNAME_VALUE=$(hostname)
GPU_ARCH="mi30x" # default
# Host names look like: linux-mi35x-gpu-1-xxxxx-runner-zzzzz
if [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
GPU_ARCH="${BASH_REMATCH[1]}"
echo "Detected GPU architecture from hostname: ${GPU_ARCH}"
else
echo "Warning: could not parse GPU architecture from '${HOSTNAME_VALUE}', defaulting to ${GPU_ARCH}"
fi
# Install the required dependencies in CI.
docker exec ci_sglang pip install --upgrade pip
docker exec ci_sglang pip uninstall sgl-kernel -y || true
docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
docker exec ci_sglang pip install -e "python[dev_hip]"
case "${GPU_ARCH}" in
mi35x)
echo "Runner uses ${GPU_ARCH}; will fetch mi35x image."
docker exec ci_sglang pip install -e "python[dev_hip]" --no-deps # TODO: only for mi35x
# For lmms_evals evaluating MMMU
docker exec -w / ci_sglang git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
docker exec -w /lmms-eval ci_sglang pip install -e . --no-deps # TODO: only for mi35x
;;
mi30x|mi300|mi325)
echo "Runner uses ${GPU_ARCH}; will fetch mi30x image."
docker exec ci_sglang pip install -e "python[dev_hip]"
# For lmms_evals evaluating MMMU
docker exec -w / ci_sglang git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
docker exec -w /lmms-eval ci_sglang pip install -e .
;;
*)
echo "Runner architecture '${GPU_ARCH}' unrecognised;" >&2
;;
esac
docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
docker exec -w /human-eval ci_sglang pip install -e .
# For lmms_evals evaluating MMMU
docker exec -w / ci_sglang git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
docker exec -w /lmms-eval ci_sglang pip install -e .
docker exec -w / ci_sglang mkdir -p /dummy-grok
mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json
docker cp ./dummy-grok ci_sglang:/

View File

@@ -3,7 +3,7 @@ set -euo pipefail
# Get version from SGLang version.py file
SGLANG_VERSION_FILE="$(dirname "$0")/../../python/sglang/version.py"
SGLANG_VERSION="v0.5.0rc0" # Default version, will be overridden if version.py is found
SGLANG_VERSION="v0.5.0rc0" # Default version, will be overridden if version.py is found
if [ -f "$SGLANG_VERSION_FILE" ]; then
VERSION_FROM_FILE=$(python3 -c '
@@ -25,130 +25,102 @@ else
echo "Warning: version.py not found, using default version: $SGLANG_VERSION" >&2
fi
# Default base tags (can be overridden by command line arguments)
DEFAULT_MI30X_BASE_TAG="${SGLANG_VERSION}-rocm630-mi30x"
DEFAULT_MI35X_BASE_TAG="${SGLANG_VERSION}-rocm700-mi35x"
# Parse command line arguments
MI30X_BASE_TAG="$DEFAULT_MI30X_BASE_TAG"
MI35X_BASE_TAG="$DEFAULT_MI35X_BASE_TAG"
MI30X_BASE_TAG="${DEFAULT_MI30X_BASE_TAG}"
MI35X_BASE_TAG="${DEFAULT_MI35X_BASE_TAG}"
while [[ $# -gt 0 ]]; do
case $1 in
--mi30x-base-tag)
MI30X_BASE_TAG="$2"
shift 2
;;
--mi35x-base-tag)
MI35X_BASE_TAG="$2"
shift 2
;;
--mi30x-base-tag) MI30X_BASE_TAG="$2"; shift 2;;
--mi35x-base-tag) MI35X_BASE_TAG="$2"; shift 2;;
-h|--help)
echo "Usage: $0 [--mi30x-base-tag TAG] [--mi35x-base-tag TAG]"
echo " --mi30x-base-tag TAG Base tag for mi30x images (default: $DEFAULT_MI30X_BASE_TAG)"
echo " --mi35x-base-tag TAG Base tag for mi35x images (default: $DEFAULT_MI35X_BASE_TAG)"
exit 0
;;
*)
echo "Unknown option $1"
echo "Use --help for usage information"
exit 1
;;
*) echo "Unknown option $1"; exit 1;;
esac
done
# Detect GPU architecture from the Kubernetes runner hostname
HOSTNAME_VALUE=$(hostname)
GPU_ARCH="mi30x" # default
# Host names look like: linux-mi35x-gpu-1-xxxxx-runner-zzzzz
if [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
GPU_ARCH="${BASH_REMATCH[1]}"
echo "Detected GPU architecture from hostname: ${GPU_ARCH}"
else
echo "Warning: could not parse GPU architecture from '${HOSTNAME_VALUE}', defaulting to ${GPU_ARCH}"
fi
# Normalise / collapse architectures we dont yet build specifically for
case "${GPU_ARCH}" in
mi35x)
echo "Runner uses ${GPU_ARCH}; will fetch mi35x image."
;;
mi30x|mi300|mi325)
echo "Runner uses ${GPU_ARCH}; will fetch mi30x image."
GPU_ARCH="mi30x"
;;
*)
echo "Runner architecture '${GPU_ARCH}' unrecognised; defaulting to mi30x image." >&2
GPU_ARCH="mi30x"
;;
esac
# Set up DEVICE_FLAG based on Kubernetes pod info
if [ -f "/etc/podinfo/gha-render-devices" ]; then
if [[ -f /etc/podinfo/gha-render-devices ]]; then
DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
else
DEVICE_FLAG="--device /dev/dri"
fi
# Function to find latest available image for a given GPU architecture
# Find the latest image
find_latest_image() {
local gpu_arch=$1
local base_tag
local base_tag days_back image_tag
if [ "$gpu_arch" == "mi30x" ]; then
base_tag="$MI30X_BASE_TAG"
elif [ "$gpu_arch" == "mi35x" ]; then
base_tag="$MI35X_BASE_TAG"
else
echo "Error: Unsupported GPU architecture '$gpu_arch'" >&2
return 1
fi
local days_back=0
while [ $days_back -lt 7 ]; do
local check_date=$(date -d "$days_back days ago" +%Y%m%d)
local image_tag="${base_tag}-${check_date}"
case "${gpu_arch}" in
mi30x) base_tag="${MI30X_BASE_TAG}" ;;
mi35x) base_tag="${MI35X_BASE_TAG}" ;;
*) echo "Error: unsupported GPU architecture '${gpu_arch}'" >&2; return 1 ;;
esac
for days_back in {0..6}; do
image_tag="${base_tag}-$(date -d "${days_back} days ago" +%Y%m%d)"
echo "Checking for image: rocm/sgl-dev:${image_tag}" >&2
# Check if the image exists by trying to get its manifest
if docker manifest inspect "rocm/sgl-dev:${image_tag}" >/dev/null 2>&1; then
echo "Found available image: rocm/sgl-dev:${image_tag}" >&2
echo "rocm/sgl-dev:${image_tag}"
return 0
fi
days_back=$((days_back + 1))
done
echo "Error: No ${gpu_arch} image found in the last 7 days for version ${base_tag}" >&2
# Final fallback to specific hardcoded images
echo "Using final fallback images..." >&2
if [ "$gpu_arch" == "mi30x" ]; then
echo "rocm/sgl-dev:v0.5.0rc0-rocm630-mi30x-20250812"
elif [ "$gpu_arch" == "mi35x" ]; then
echo "Error: no ${gpu_arch} image found in the last 7 days for base ${base_tag}" >&2
echo "Using hard-coded fallback…" >&2
if [[ "${gpu_arch}" == "mi35x" ]]; then
echo "rocm/sgl-dev:v0.5.0rc0-rocm700-mi35x-20250812"
else
echo "rocm/sgl-dev:v0.5.0rc0-rocm630-mi30x-20250812" # Default to mi30x
echo "rocm/sgl-dev:v0.5.0rc0-rocm630-mi30x-20250812"
fi
return 0
}
# Determine image finder and fallback based on runner
# In Kubernetes, the hostname contains the GPU type (e.g., linux-mi300-gpu-1-bgg8r-runner-vknlb)
# Extract the GPU type from hostname
HOSTNAME_VALUE=$(hostname)
RUNNER_NAME="unknown"
if [[ "${HOSTNAME_VALUE}" =~ ^(linux-mi[0-9]+-gpu-[0-9]+) ]]; then
RUNNER_NAME="${BASH_REMATCH[1]}"
echo "Extracted runner from hostname: ${RUNNER_NAME}"
else
echo "Could not extract runner info from hostname: ${HOSTNAME_VALUE}"
fi
echo "The runner is: ${RUNNER_NAME}"
GPU_ARCH="mi30x"
# Check for mi350/mi355 runners
if [[ "${RUNNER_NAME}" =~ ^linux-mi350-gpu-[0-9]+$ ]] || [[ "${RUNNER_NAME}" =~ ^linux-mi355-gpu-[0-9]+$ ]]; then
echo "Runner is ${RUNNER_NAME}, will find mi35x image."
GPU_ARCH="mi35x"
# Check for mi300/mi325 runners
elif [[ "${RUNNER_NAME}" =~ ^linux-mi300-gpu-[0-9]+$ ]] || [[ "${RUNNER_NAME}" =~ ^linux-mi325-gpu-[0-9]+$ ]]; then
echo "Runner is ${RUNNER_NAME}, will find mi30x image."
else
echo "Runner type not recognized: '${RUNNER_NAME}'"
echo "Defaulting to find mi30x image"
fi
# Find and pull the latest image
# Pull and run the latest image
IMAGE=$(find_latest_image "${GPU_ARCH}")
echo "Pulling Docker image: $IMAGE"
docker pull "$IMAGE"
echo "Pulling Docker image: ${IMAGE}"
docker pull "${IMAGE}"
# Run the container
echo "Starting container: ci_sglang"
docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
echo "Launching container: ci_sglang"
docker run -dt --user root --device=/dev/kfd ${DEVICE_FLAG} \
-v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout" \
--ipc=host --group-add video \
--shm-size 32g \
@@ -157,4 +129,4 @@ docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
--security-opt seccomp=unconfined \
-w /sglang-checkout \
--name ci_sglang \
"$IMAGE"
"${IMAGE}"