Files
xc-llm-kunlun/ci/scripts/docker/start_docker.sh

102 lines
2.8 KiB
Bash
Raw Permalink Normal View History

#!/usr/bin/env bash
# Start the CI docker container with host NVIDIA driver bits mounted in
# so the container can use the GPUs.
# Requires DOCKER_NAME, IMAGE_NAME, WORKSPACE_MOUNT and CONDA_ENV to be
# defined by ci/scripts/common/env.sh; `log` comes from log.sh.
set -euo pipefail
source ci/scripts/common/env.sh
source ci/scripts/common/log.sh
log "Starting docker container: ${DOCKER_NAME}"
# Remove any leftover container with the same name before starting.
# -F/-x match the name as a literal whole line; the previous
# "^${DOCKER_NAME}$" regex could false-match when the name contains
# regex metacharacters (e.g. a dot).
if docker ps -a --format '{{.Names}}' | grep -Fxq "${DOCKER_NAME}"; then
log "Container exists, removing first..."
# Best-effort: the container may already be stopped or half-removed.
docker stop "${DOCKER_NAME}" >/dev/null 2>&1 || true
docker rm "${DOCKER_NAME}" >/dev/null 2>&1 || true
fi
# Locate CUDA runtime libraries on the host, preferring the unversioned
# /usr/local/cuda symlink over versioned installs (cuda-11.8, cuda-12.1, ...).
# An unmatched glob stays literal and is rejected by the -d test.
# NOTE(review): HOST_CUDA_LIB_PATH is only logged here and not referenced
# later in this script — confirm whether a downstream script consumes it.
HOST_CUDA_LIB_PATH=""
for candidate in "/usr/local/cuda/lib64" /usr/local/cuda-*/lib64; do
[ -d "$candidate" ] || continue
HOST_CUDA_LIB_PATH="$candidate"
break
done
if [ -n "${HOST_CUDA_LIB_PATH}" ]; then
log "Detected host CUDA lib path: ${HOST_CUDA_LIB_PATH}"
else
log "Host CUDA lib path not found, will use container CUDA"
fi
# NVIDIA device mapping: pass the GPU character devices through to the
# container. DEVICE_ARGS must remain a single space-separated string
# because the later `docker run` expands it unquoted (device paths
# contain no whitespace, so string concatenation is safe).
DEVICE_ARGS=""
if [ -e "/dev/nvidia0" ]; then
# One loop covers GPU 0..16; brace expansion avoids the extra `seq`
# process and the previously-duplicated nvidia0 special case.
for i in {0..16}; do
if [ -e "/dev/nvidia${i}" ]; then
DEVICE_ARGS="${DEVICE_ARGS} --device /dev/nvidia${i}:/dev/nvidia${i}"
fi
done
# Control devices needed by the CUDA driver (uvm) and modesetting.
if [ -e "/dev/nvidia-uvm" ]; then
DEVICE_ARGS="${DEVICE_ARGS} --device /dev/nvidia-uvm:/dev/nvidia-uvm"
fi
if [ -e "/dev/nvidia-modeset" ]; then
DEVICE_ARGS="${DEVICE_ARGS} --device /dev/nvidia-modeset:/dev/nvidia-modeset"
fi
else
log "WARNING: /dev/nvidia0 not found, GPU may not be available"
fi
# Bind-mount the host nvidia-smi binary into the container when present.
# NVIDIA_BIN stays a string; it is expanded unquoted by the later
# `docker run` so it word-splits into "-v src:dst".
NVIDIA_BIN=""
if [ ! -f "/usr/bin/nvidia-smi" ]; then
log "WARNING: nvidia-smi not found on host"
else
NVIDIA_BIN="-v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi"
log "Added nvidia-smi mount"
fi
# Bind-mount the NVML libraries from the host when present.
# NVIDIA_LIBS stays a string because the later `docker run` expands it
# unquoted (library paths contain no whitespace).
NVIDIA_LIBS=""
if [ -d "/usr/lib64" ]; then
for lib in libnvidia-ml.so libnvidia-ml.so.1; do
if [ -f "/usr/lib64/${lib}" ]; then
NVIDIA_LIBS="${NVIDIA_LIBS} -v /usr/lib64/${lib}:/usr/lib64/${lib}"
fi
done
fi
# Ensure the unversioned libcuda.so symlink exists on the HOST (this
# modifies the host filesystem, not the container). Only create it when
# the versioned target is present — the previous unconditional `ln -sf`
# could leave a dangling /usr/lib64/libcuda.so behind on driver-less
# hosts. Best-effort (|| true): /usr/lib64 may not be writable.
if [ -e "/usr/lib64/libcuda.so.1" ]; then
ln -sf /usr/lib64/libcuda.so.1 /usr/lib64/libcuda.so || true
fi
# Bind-mount the CUDA driver userspace libraries only when they actually
# exist on the host: an unconditional -v with a missing source path makes
# docker create an empty DIRECTORY there inside the container.
# libnvidia-ml.so* is deliberately NOT listed here — ${NVIDIA_LIBS}
# (built above) already mounts it, and mounting the same target twice
# makes docker run fail with "Duplicate mount point". The original
# command hid that failure behind a stray `2>/dev/null` dropped into the
# middle of the argument list, which silenced ALL docker run stderr.
DRIVER_LIB_MOUNTS=""
for lib in libcuda.so.1 libcuda.so libnvidia-ptxjitcompiler.so.1; do
if [ -e "/usr/lib64/${lib}" ]; then
DRIVER_LIB_MOUNTS="${DRIVER_LIB_MOUNTS} -v /usr/lib64/${lib}:/usr/lib64/${lib}"
fi
done
log "docker run ${IMAGE_NAME}"
# ${DRIVER_LIB_MOUNTS}/${DEVICE_ARGS}/${NVIDIA_BIN}/${NVIDIA_LIBS} are
# intentionally unquoted: they are space-separated option strings built
# above and must word-split into individual docker arguments.
docker run \
-h "$(hostname)" \
--privileged \
--net=host \
--user=root \
--name="${DOCKER_NAME}" \
-v /home:/home \
-v "${WORKSPACE_MOUNT}" \
-v /ssd1:/ssd1 \
-v /ssd2:/ssd2 \
-v /ssd3:/ssd3 \
-v /dev/shm:/dev/shm \
-v /var/run/docker.sock:/var/run/docker.sock \
-w /workspace \
${DRIVER_LIB_MOUNTS} \
${DEVICE_ARGS} \
${NVIDIA_BIN} \
${NVIDIA_LIBS} \
--shm-size=16G \
-e NVIDIA_VISIBLE_DEVICES=all \
-e NVIDIA_DRIVER_CAPABILITIES=compute,utility \
-itd "${IMAGE_NAME}"
log "Container started. Inject conda activate into bashrc"
# Append the conda activation line so interactive shells inside the
# container start in the right environment. The script is passed in a
# double-quoted string, so ${CONDA_ENV} expands on the HOST before the
# command reaches the container; `~/.bashrc` is left for the container
# shell to resolve. The container was created fresh above, so repeated
# runs cannot accumulate duplicate lines. `conda env list || true` is a
# best-effort sanity log; its failure must not abort the script.
docker exec "${DOCKER_NAME}" bash -lc "
echo 'conda activate ${CONDA_ENV}' >> ~/.bashrc
conda env list || true
"