[CI/Build] Add CI end-to-end (E2E) tests (#139)
* [CI/Build] Add CI end-to-end (E2E) tests Signed-off-by: Chenchao Hu <huchenchao@example.com>
This commit is contained in:
101
ci/scripts/docker/start_docker.sh
Executable file
101
ci/scripts/docker/start_docker.sh
Executable file
@@ -0,0 +1,101 @@
|
||||
#!/usr/bin/env bash
#
# Start (or restart) the CI E2E docker container.
#
# Required env (provided by ci/scripts/common/env.sh):
#   DOCKER_NAME      - container name
#   IMAGE_NAME       - image to run
#   WORKSPACE_MOUNT  - "host:container" bind-mount spec for the workspace
#   CONDA_ENV        - conda env to activate inside the container
set -euo pipefail

source ci/scripts/common/env.sh
source ci/scripts/common/log.sh

log "Starting docker container: ${DOCKER_NAME}"

# Remove any stale container with the same name so re-runs are idempotent.
# -Fxq: exact fixed-string match of the whole line (the name is data, not a regex).
if docker ps -a --format '{{.Names}}' | grep -Fxq "${DOCKER_NAME}"; then
  log "Container exists, removing first..."
  docker stop "${DOCKER_NAME}" >/dev/null 2>&1 || true
  docker rm "${DOCKER_NAME}" >/dev/null 2>&1 || true
fi

# Locate the host CUDA library directory (first existing candidate wins).
HOST_CUDA_LIB_PATH=""
for path in /usr/local/cuda/lib64 /usr/local/cuda-*/lib64; do
  if [ -d "$path" ]; then
    HOST_CUDA_LIB_PATH="$path"
    break
  fi
done

if [ -n "${HOST_CUDA_LIB_PATH}" ]; then
  log "Detected host CUDA lib path: ${HOST_CUDA_LIB_PATH}"
else
  log "Host CUDA lib path not found, will use container CUDA"
fi

# NVIDIA device mapping.
# Arrays (not whitespace-joined strings) so the arguments expand safely
# into the docker command line without relying on word splitting (SC2086).
DEVICE_ARGS=()
if [ -e /dev/nvidia0 ]; then
  # GPUs 0..16, matching the original /dev/nvidia0 + seq 1 16 range.
  for i in {0..16}; do
    if [ -e "/dev/nvidia${i}" ]; then
      DEVICE_ARGS+=(--device "/dev/nvidia${i}:/dev/nvidia${i}")
    fi
  done
  for dev in /dev/nvidia-uvm /dev/nvidia-modeset; do
    if [ -e "$dev" ]; then
      DEVICE_ARGS+=(--device "${dev}:${dev}")
    fi
  done
else
  log "WARNING: /dev/nvidia0 not found, GPU may not be available"
fi

# Mount nvidia-smi from the host when available.
NVIDIA_BIN=()
if [ -f /usr/bin/nvidia-smi ]; then
  NVIDIA_BIN+=(-v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi)
  log "Added nvidia-smi mount"
else
  log "WARNING: nvidia-smi not found on host"
fi

# Mount critical NVIDIA user-space libraries, each only if it exists on the
# host. BUGFIX: the original unconditionally bind-mounted libcuda.so{,.1},
# libnvidia-ml.so.1 (a second time) and libnvidia-ptxjitcompiler.so.1, which
# breaks `docker run` on hosts missing any of them, and carried a stray
# `2>/dev/null` mid-argument-list that silenced all of docker run's stderr.
NVIDIA_LIBS=()
for lib in \
  libnvidia-ml.so \
  libnvidia-ml.so.1 \
  libcuda.so.1 \
  libcuda.so \
  libnvidia-ptxjitcompiler.so.1; do
  if [ -f "/usr/lib64/${lib}" ]; then
    NVIDIA_LIBS+=(-v "/usr/lib64/${lib}:/usr/lib64/${lib}")
  fi
done

# Ensure the libcuda dev symlink exists on the host (best effort; non-root
# or read-only /usr/lib64 is tolerated).
ln -sf /usr/lib64/libcuda.so.1 /usr/lib64/libcuda.so || true

log "docker run ${IMAGE_NAME}"
# ${arr[@]+...} expansion keeps `set -u` happy on empty arrays for bash < 4.4.
docker run \
  -h "$(hostname)" \
  --privileged \
  --net=host \
  --user=root \
  --name="${DOCKER_NAME}" \
  -v /home:/home \
  -v "${WORKSPACE_MOUNT}" \
  -v /ssd2:/ssd2 \
  -v /ssd1:/ssd1 \
  -v /ssd3:/ssd3 \
  -v /dev/shm:/dev/shm \
  -v /var/run/docker.sock:/var/run/docker.sock \
  -w /workspace \
  ${DEVICE_ARGS[@]+"${DEVICE_ARGS[@]}"} \
  ${NVIDIA_BIN[@]+"${NVIDIA_BIN[@]}"} \
  ${NVIDIA_LIBS[@]+"${NVIDIA_LIBS[@]}"} \
  --shm-size=16G \
  -e NVIDIA_VISIBLE_DEVICES=all \
  -e NVIDIA_DRIVER_CAPABILITIES=compute,utility \
  -itd "${IMAGE_NAME}"

log "Container started. Inject conda activate into bashrc"
docker exec "${DOCKER_NAME}" bash -lc "
echo 'conda activate ${CONDA_ENV}' >> ~/.bashrc
conda env list || true
"
|
||||
Reference in New Issue
Block a user