[docker] added rdma support (#3619)

2025-02-17 15:36:16 +08:00
parent d03c4c25a7
commit c9565e49e7
7 changed files with 39 additions and 11 deletions
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -14,6 +14,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 \
    && update-alternatives --set python3 /usr/bin/python3.10 && apt install python3.10-distutils -y \
    && apt install curl git sudo libibverbs-dev -y \
+    && apt install -y rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 \
    && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py \
    && python3 --version \
    && python3 -m pip --version \
--- a/docker/Dockerfile.dev
+++ b/docker/Dockerfile.dev
@@ -21,6 +21,7 @@ RUN apt-get update && apt-get install -y \
    pkg-config \
    libssl-dev \
    bear \
+    && apt-get install -y rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -20,6 +20,8 @@ ARG TRITON_COMMIT="improve_fa_decode_3.0.0"
 ARG ATER_REPO="https://github.com/HaiShaw/ater"
 ARG CK_COMMITS="fa05ae"

+RUN apt-get update && apt-get install -y rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1
+
 RUN git clone ${SGL_REPO} \
    && cd sglang \
    && if [ "${SGL_BRANCH}" = ${SGL_DEFAULT} ]; then \
--- a/docker/compose.yaml
+++ b/docker/compose.yaml
@@ -7,7 +7,8 @@ services:
      # If you use modelscope, you need mount this directory
      # - ${HOME}/.cache/modelscope:/root/.cache/modelscope
    restart: always
-    network_mode: host
+    network_mode: host # required by RDMA
+    privileged: true # required by RDMA; NOTE(review): privileged mode gives the container broad access to the host - confirm this is acceptable for your deployment
    # Or you can only publish port 30000
    # ports:
    #   - 30000:30000
@@ -16,8 +17,7 @@ services:
      # if you use modelscope to download model, you need set this environment
      # - SGLANG_USE_MODELSCOPE: true
    entrypoint: python3 -m sglang.launch_server
-    command:
-      --model-path meta-llama/Llama-3.1-8B-Instruct
+    command: --model-path meta-llama/Llama-3.1-8B-Instruct
      --host 0.0.0.0
      --port 30000
    ulimits:
@@ -31,5 +31,5 @@ services:
        reservations:
          devices:
            - driver: nvidia
-              device_ids: ['0']
+              device_ids: ["0"]
              capabilities: [gpu]