This commit is contained in:
2025-10-09 16:47:16 +08:00
parent c8feb4deb5
commit e27e3f16bb
5248 changed files with 1778505 additions and 0 deletions

View File

@@ -0,0 +1,65 @@
FROM google/cloud-sdk:slim
# Build args.
ARG GITHUB_REF=refs/heads/main
# TODO: This Dockerfile installs pytorch/xla 3.6 wheels. There are also 3.7
# wheels available; see below.
ENV PYTHON_VERSION=3.6
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
git \
curl \
ca-certificates
# Install conda and python.
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b && \
rm ~/miniconda.sh
ENV PATH=/root/miniconda3/bin:$PATH
RUN conda create -y --name container python=$PYTHON_VERSION
# Run the rest of commands within the new conda env.
# Use absolute path to appease Codefactor.
SHELL ["/root/miniconda3/bin/conda", "run", "-n", "container", "/bin/bash", "-c"]
RUN conda install -y python=$PYTHON_VERSION mkl
RUN pip uninstall -y torch && \
# Python 3.7 wheels are available. Replace cp36-cp36m with cp37-cp37m
gsutil cp 'gs://tpu-pytorch/wheels/torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
gsutil cp 'gs://tpu-pytorch/wheels/torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
gsutil cp 'gs://tpu-pytorch/wheels/torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
pip install 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
pip install 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
pip install 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
rm 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
rm 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
rm 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
apt-get install -y libomp5
ENV LD_LIBRARY_PATH=root/miniconda3/envs/container/lib
# Install huggingface/transformers at the current PR, plus dependencies.
RUN git clone https://github.com/huggingface/transformers.git && \
cd transformers && \
git fetch origin $GITHUB_REF:CI && \
git checkout CI && \
cd .. && \
pip install ./transformers && \
pip install -r ./transformers/examples/pytorch/_test_requirements.txt && \
pip install pytest
RUN python -c "import torch_xla; print(torch_xla.__version__)"
RUN python -c "import transformers as trf; print(trf.__version__)"
RUN conda init bash
COPY docker-entrypoint.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
CMD ["bash"]

View File

@@ -0,0 +1,38 @@
local base = import 'templates/base.libsonnet';
local tpus = import 'templates/tpus.libsonnet';
local utils = import "templates/utils.libsonnet";
local volumes = import "templates/volumes.libsonnet";
local bertBaseCased = base.BaseTest {
frameworkPrefix: "hf",
modelName: "bert-base-cased",
mode: "example",
configMaps: [],
timeout: 3600, # 1 hour, in seconds
image: std.extVar('image'),
imageTag: std.extVar('image-tag'),
tpuSettings+: {
softwareVersion: "pytorch-nightly",
},
accelerator: tpus.v3_8,
volumeMap+: {
datasets: volumes.PersistentVolumeSpec {
name: "huggingface-cluster-disk",
mountPath: "/datasets",
},
},
command: utils.scriptCommand(
|||
python -m pytest -s transformers/examples/pytorch/test_xla_examples.py -v
test_exit_code=$?
echo "\nFinished running commands.\n"
test $test_exit_code -eq 0
|||
),
};
bertBaseCased.oneshotJob

View File

@@ -0,0 +1,32 @@
apiVersion: v1
kind: PersistentVolume
metadata:
name: huggingface-cluster-disk
spec:
storageClassName: ""
capacity:
storage: 500Gi
accessModes:
- ReadOnlyMany
claimRef:
namespace: default
name: huggingface-cluster-disk-claim
gcePersistentDisk:
pdName: huggingface-cluster-disk
fsType: ext4
readOnly: true
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: huggingface-cluster-disk-claim
spec:
# Specify "" as the storageClassName so it matches the PersistentVolume's StorageClass.
# A nil storageClassName value uses the default StorageClass. For details, see
# https://kubernetes.io/docs/concepts/storage/persistent-volumes/#class-1
storageClassName: ""
accessModes:
- ReadOnlyMany
resources:
requests:
storage: 1Ki

View File

@@ -0,0 +1,8 @@
#!/bin/bash
source ~/.bashrc
echo "running docker-entrypoint.sh"
conda activate container
echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
echo "printed TPU info"
export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
exec "$@"#!/bin/bash