Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -0,0 +1,28 @@
# Expert parallel kernels
Large-scale cluster-level expert parallel, as described in the [DeepSeek-V3 Technical Report](http://arxiv.org/abs/2412.19437), is an efficient way to deploy sparse MoE models with many experts. However, such deployment requires many components beyond a normal Python package, including system package support and system driver support. It is impossible to bundle all these components into a Python package.
Here we break down the requirements into two steps:
1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
2. Configure NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine.
Step 2 is necessary for multi-node deployment.
All scripts accept a positional argument as workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`.
## Usage
```bash
# for hopper
TORCH_CUDA_ARCH_LIST="9.0" bash install_python_libraries.sh
# for blackwell
TORCH_CUDA_ARCH_LIST="10.0" bash install_python_libraries.sh
```
Additional step for multi-node deployment:
```bash
sudo bash configure_system_drivers.sh # update-initramfs can take several minutes
sudo reboot # Reboot is required to load the new driver
```

View File

@@ -0,0 +1,17 @@
set -ex

# Enable IBGDA (InfiniBand GPUDirect Async) for the NVIDIA kernel module.
# Must be run with root privileges (see README: `sudo bash configure_system_drivers.sh`),
# and the machine must be rebooted afterwards to reload the driver.
NVIDIA_CONF=/etc/modprobe.d/nvidia.conf
IBGDA_LINE='options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"'

# turn on IBGDA — append only if not already present, so re-running this
# script does not accumulate duplicate option lines in the modprobe config.
if ! grep -qsF "$IBGDA_LINE" "$NVIDIA_CONF"; then
    echo "$IBGDA_LINE" | tee -a "$NVIDIA_CONF"
fi

# Rebuild the initramfs so the new module options take effect at boot.
if command -v update-initramfs &> /dev/null; then
    # for Debian/Ubuntu
    sudo update-initramfs -u
elif command -v dracut &> /dev/null; then
    # for Fedora/CentOS
    sudo dracut --force
else
    echo "No supported initramfs update tool found."
    exit 1
fi

echo "Please reboot the system to apply the changes"

View File

@@ -0,0 +1,92 @@
From 18c0599c2f07ec965132efa25961dc8179c2dda3 Mon Sep 17 00:00:00 2001
From: Yongji Wu <wuyongji317@gmail.com>
Date: Tue, 20 May 2025 13:41:12 -0700
Subject: [PATCH] fix reinit issues due to states not cleaned up
fix double free
---
src/host/init/init.cu | 10 ++++++++++
.../internal/host/nvshmemi_mem_transport.hpp | 15 +++++++++++++++
src/modules/bootstrap/uid/bootstrap_uid.cpp | 5 +++++
3 files changed, 30 insertions(+)
diff --git a/src/host/init/init.cu b/src/host/init/init.cu
index b1c5dbf..1fecb4b 100644
--- a/src/host/init/init.cu
+++ b/src/host/init/init.cu
@@ -43,6 +43,8 @@
#include "internal/host/nvshmemi_types.h"
#include "internal/host/shared_memory.h"
#include "internal/host/nvshmemi_symmetric_heap.hpp"
+// eep-dev
+#include "internal/host/nvshmemi_mem_transport.hpp"
extern __constant__ nvshmemi_device_host_state_t nvshmemi_device_state_d;
static std::map<void *, int> registered_device_states;
@@ -1293,6 +1295,14 @@ void nvshmemid_hostlib_finalize(void *device_ctx, void *transport_device_ctx) {
/* Multi-init Multi-fini*/
nvshmemi_state = NULL;
nvshmemi_device_state.nvshmemi_is_nvshmem_initialized = 0;
+
+ // eep-dev
+ nvshmemi_mem_p2p_transport::destroy_instance();
+ nvshmemi_mem_remote_transport::destroy_instance();
+ free(nvshmemi_default_session);
+ nvshmemi_default_session = nullptr;
+ nvshmemi_device_state.nvshmemi_is_nvshmem_bootstrapped = false;
+
nvshmemi_is_device_state_ready = false;
} else
nvshmemi_boot_handle.barrier(&nvshmemi_boot_handle);
diff --git a/src/include/internal/host/nvshmemi_mem_transport.hpp b/src/include/internal/host/nvshmemi_mem_transport.hpp
index 2495844..e4f408a 100644
--- a/src/include/internal/host/nvshmemi_mem_transport.hpp
+++ b/src/include/internal/host/nvshmemi_mem_transport.hpp
@@ -36,6 +36,13 @@ class nvshmemi_mem_p2p_transport final {
return p2p_objref_;
}
}
+ // eep-dev
+ static void destroy_instance(void) {
+ if (p2p_objref_ != nullptr) {
+ delete p2p_objref_;
+ p2p_objref_ = nullptr;
+ }
+ }
void print_mem_handle(int pe_id, int transport_idx, nvshmemi_symmetric_heap &obj);
@@ -87,6 +94,14 @@ class nvshmemi_mem_remote_transport final {
}
}
+ // eep-dev
+ static void destroy_instance(void) {
+ if (remote_objref_ != nullptr) {
+ delete remote_objref_;
+ remote_objref_ = nullptr;
+ }
+ }
+
int gather_mem_handles(nvshmemi_symmetric_heap &obj, uint64_t heap_offset, size_t size);
/* On-demand registration and release of memory */
int register_mem_handle(nvshmem_mem_handle_t *local_handles, int transport_idx,
diff --git a/src/modules/bootstrap/uid/bootstrap_uid.cpp b/src/modules/bootstrap/uid/bootstrap_uid.cpp
index a1fa748..788fa96 100644
--- a/src/modules/bootstrap/uid/bootstrap_uid.cpp
+++ b/src/modules/bootstrap/uid/bootstrap_uid.cpp
@@ -630,6 +630,11 @@ int nvshmemi_bootstrap_plugin_pre_init(bootstrap_handle_t* handle, const int abi
// Discover the network for bootstrap, if not done previously.
// This code needs to be stateful to be able to be called multiple times by the caller
BOOTSTRAP_CHECK(bootstrap_net_init());
+ // eep-dev
+ if (handle->pre_init_ops != nullptr) {
+ BOOTSTRAP_PTR_FREE(handle->pre_init_ops);
+ handle->pre_init_ops = nullptr;
+ }
if (handle->pre_init_ops == nullptr) {
BOOTSTRAP_CALLOC(&handle->pre_init_ops, 1);
handle->pre_init_ops->get_unique_id = bootstrap_get_unique_id;
--
2.43.0

View File

@@ -0,0 +1,86 @@
#!/bin/bash
set -ex

# Build and install the expert-parallel kernel stack:
#   1. NVSHMEM 3.2.5, patched for DeepEP and for clean re-initialization
#   2. pplx-kernels (requires PyTorch to be installed at build time)
#
# Usage: [-w workspace_dir] [-n]
#   -w  staging workspace (default: ./eep_kernels_workspace)
#   -n  skip downloading/patching NVSHMEM (reuse existing nvshmem_src)

# Default workspace directory
WORKSPACE=$(pwd)/eep_kernels_workspace
INSTALL_NVSHMEM=true

# Parse command line arguments
while getopts "w:n" opt; do
    case $opt in
        w)
            WORKSPACE="$OPTARG"
            ;;
        n)
            INSTALL_NVSHMEM=false
            ;;
        \?)
            echo "Invalid option: -$OPTARG" >&2
            exit 1
            ;;
    esac
done

# Fail fast before any downloads: the NVSHMEM build requires a CUDA toolchain.
if [ -z "$CUDA_HOME" ]; then
    echo "CUDA_HOME is not set, please set it to your CUDA installation directory."
    exit 1
fi

mkdir -p "$WORKSPACE"

# install dependencies if not installed
pip3 install cmake torch ninja

# build nvshmem
pushd "$WORKSPACE"
if [ "$INSTALL_NVSHMEM" = true ]; then
    mkdir -p nvshmem_src
    wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
    tar -xvf nvshmem_src_3.2.5-1.txz -C nvshmem_src --strip-components=1
    pushd nvshmem_src
    wget https://github.com/deepseek-ai/DeepEP/raw/main/third-party/nvshmem.patch
    # git init lets `git apply` run inside an extracted (non-repo) tree
    git init
    git apply -vvv nvshmem.patch
    # reinit/double-free fixes; expects eep_nvshmem.patch two levels up
    git apply --reject --whitespace=fix ../../eep_nvshmem.patch
else
    pushd nvshmem_src
fi

# disable all features except IBGDA
export NVSHMEM_IBGDA_SUPPORT=1
export NVSHMEM_SHMEM_SUPPORT=0
export NVSHMEM_UCX_SUPPORT=0
export NVSHMEM_USE_NCCL=0
export NVSHMEM_PMIX_SUPPORT=0
export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
export NVSHMEM_USE_GDRCOPY=0
export NVSHMEM_IBRC_SUPPORT=0
export NVSHMEM_BUILD_TESTS=0
export NVSHMEM_BUILD_EXAMPLES=0
export NVSHMEM_MPI_SUPPORT=0
export NVSHMEM_BUILD_HYDRA_LAUNCHER=0
export NVSHMEM_BUILD_TXZ_PACKAGE=0

cmake -G Ninja -S . -B "$WORKSPACE/nvshmem_build/" -DCMAKE_INSTALL_PREFIX="$WORKSPACE/nvshmem_install"
cmake --build "$WORKSPACE/nvshmem_build/" --target install

popd

# let CMake find the freshly installed NVSHMEM
export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem_install:$CMAKE_PREFIX_PATH

# build and install pplx, require pytorch installed
pushd "$WORKSPACE"
git clone https://github.com/ppl-ai/pplx-kernels
cd pplx-kernels
# see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
# PIP_NO_BUILD_ISOLATION=0 disables build isolation
PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install . --no-deps -v

View File

@@ -0,0 +1,190 @@
#!/usr/bin/env bash
set -ex

# usage: ./install_python_libraries.sh [options]
#   --workspace <dir>     workspace directory (default: ./ep_kernels_workspace)
#   --mode <mode>         "install" (default) or "wheel"
#   --pplx-ref <commit>   pplx-kernels commit hash
#   --deepep-ref <commit> DeepEP commit hash

CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
PPLX_COMMIT_HASH=${PPLX_COMMIT_HASH:-"12cecfd"}
DEEPEP_COMMIT_HASH=${DEEPEP_COMMIT_HASH:-"73b6ea4"}
NVSHMEM_VER=3.3.24 # Supports both CUDA 12 and 13
WORKSPACE=${WORKSPACE:-$(pwd)/ep_kernels_workspace}
MODE=${MODE:-install}
# e.g. "release 12" -> "12"; egrep is deprecated, use grep -E
CUDA_VERSION_MAJOR=$(${CUDA_HOME}/bin/nvcc --version | grep -Eo "release [0-9]+" | cut -d ' ' -f 2)

# Abort when an option is missing its value ($2 absent or another flag).
require_arg() {
    if [[ -z "$2" || "$2" =~ ^- ]]; then
        echo "Error: $1 requires an argument." >&2
        exit 1
    fi
}

# Parse arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --workspace)
            require_arg "$1" "${2:-}"
            WORKSPACE="$2"
            shift 2
            ;;
        --mode)
            require_arg "$1" "${2:-}"
            MODE="$2"
            shift 2
            ;;
        --pplx-ref)
            require_arg "$1" "${2:-}"
            PPLX_COMMIT_HASH="$2"
            shift 2
            ;;
        --deepep-ref)
            require_arg "$1" "${2:-}"
            DEEPEP_COMMIT_HASH="$2"
            shift 2
            ;;
        *)
            echo "Error: Unknown argument '$1'" >&2
            exit 1
            ;;
    esac
done

mkdir -p "$WORKSPACE"
WHEEL_DIR="$WORKSPACE/dist"
mkdir -p "$WHEEL_DIR"

pushd "$WORKSPACE"

# install dependencies if not installed
if [ -z "$VIRTUAL_ENV" ]; then
    uv pip install --system cmake torch ninja
else
    uv pip install cmake torch ninja
fi

# fetch the prebuilt NVSHMEM archive for this machine's architecture
ARCH=$(uname -m)
case "${ARCH,,}" in
    x86_64|amd64)
        NVSHMEM_SUBDIR="linux-x86_64"
        ;;
    aarch64|arm64)
        NVSHMEM_SUBDIR="linux-sbsa"
        ;;
    *)
        echo "Unsupported architecture: ${ARCH}" >&2
        exit 1
        ;;
esac

NVSHMEM_FILE="libnvshmem-${NVSHMEM_SUBDIR}-${NVSHMEM_VER}_cuda${CUDA_VERSION_MAJOR}-archive.tar.xz"
NVSHMEM_URL="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/${NVSHMEM_SUBDIR}/${NVSHMEM_FILE}"

pushd "$WORKSPACE"
echo "Downloading NVSHMEM ${NVSHMEM_VER} for ${NVSHMEM_SUBDIR} ..."
curl -fSL "${NVSHMEM_URL}" -o "${NVSHMEM_FILE}"
tar -xf "${NVSHMEM_FILE}"
mv "${NVSHMEM_FILE%.tar.xz}" nvshmem
rm -f "${NVSHMEM_FILE}"
# NOTE(review): these paths look odd — presumably meant nvshmem/bin and
# nvshmem/share; harmless either way since -rf ignores missing paths. TODO confirm.
rm -rf nvshmem/lib/bin nvshmem/lib/share
popd

# let CMake locate the just-unpacked NVSHMEM
export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem/lib/cmake:$CMAKE_PREFIX_PATH
# Return 0 (success) when $1 is a git checkout with uncommitted changes,
# 1 otherwise (including when $1 is not a git repository).
is_git_dirty() {
    local dir=$1
    pushd "$dir" > /dev/null
    # Bug fix: the original used `3>/dev/null`, which redirects file
    # descriptor 3 (a no-op here) and leaves git's stderr visible;
    # stderr is fd 2.
    if [ -d ".git" ] && [ -n "$(git status --porcelain 2>/dev/null)" ]; then
        popd > /dev/null
        return 0
    else
        popd > /dev/null
        return 1
    fi
}
# Clone $1 into directory $2 unless a usable checkout is already there.
#   $1 repo_url    - git URL to clone
#   $2 dir_name    - target directory name
#   $3 key_file    - file whose presence marks a complete checkout
#   $4 commit_hash - optional commit to check out after cloning
# A dirty checkout is left untouched; an incomplete one is wiped and re-cloned.
clone_repo() {
    local repo_url=$1
    local dir_name=$2
    local key_file=$3
    local commit_hash=$4

    if [ -d "$dir_name" ]; then
        if is_git_dirty "$dir_name"; then
            echo "$dir_name directory is dirty, skipping clone"
            return 0
        elif [ -d "$dir_name/.git" ] && [ -f "$dir_name/$key_file" ]; then
            echo "$dir_name directory exists and appears complete"
            return 0
        fi
        echo "$dir_name directory exists but clone appears incomplete, cleaning up and re-cloning"
        rm -rf "$dir_name"
    fi

    # Single clone path (the original duplicated clone+checkout in two
    # branches); `git -C` avoids error-prone cd/cd.. pairs.
    git clone "$repo_url"
    if [ -n "$commit_hash" ]; then
        git -C "$dir_name" checkout "$commit_hash"
    fi
}
# Clone (if needed) and build or install one Python kernel package.
#   $1 repo      - git URL to clone
#   $2 name      - project directory name
#   $3 key       - file marking a complete checkout (passed to clone_repo)
#   $4 commit    - optional commit hash to check out
#   $5 extra_env - shell snippet eval'd in front of the build command
#                  (e.g. "export NVSHMEM_DIR=...; ") so exported variables
#                  are visible to the build
do_build() {
    local repo=$1
    local name=$2
    local key=$3
    local commit=$4
    local extra_env=$5
    pushd "$WORKSPACE"
    clone_repo "$repo" "$name" "$key" "$commit"
    cd "$name"
    # DeepEP CUDA 13 patch
    # NOTE(review): rewrites DeepEP's setup.py include list so the build also
    # finds the CCCL headers shipped under ${CUDA_HOME}/include/cccl on
    # CUDA >= 13; no-op for older toolkits. TODO confirm against DeepEP setup.py.
    if [[ "$name" == "DeepEP" && "${CUDA_VERSION_MAJOR}" -ge 13 ]]; then
        sed -i "s|f'{nvshmem_dir}/include']|f'{nvshmem_dir}/include', '${CUDA_HOME}/include/cccl']|" "setup.py"
    fi
    # --no-build-isolation: build against the torch already installed above.
    if [ "$MODE" = "install" ]; then
        echo "Installing $name into environment"
        eval "$extra_env" uv pip install --no-build-isolation -vvv .
    else
        echo "Building $name wheel into $WHEEL_DIR"
        eval "$extra_env" uv build --wheel --no-build-isolation -vvv --out-dir "$WHEEL_DIR" .
    fi
    popd
}
# build pplx-kernels (no extra environment needed)
do_build \
    "https://github.com/ppl-ai/pplx-kernels" \
    "pplx-kernels" \
    "setup.py" \
    "$PPLX_COMMIT_HASH" \
    ""

# build DeepEP — NVSHMEM_DIR tells its build where the NVSHMEM tree lives
do_build \
    "https://github.com/deepseek-ai/DeepEP" \
    "DeepEP" \
    "setup.py" \
    "$DEEPEP_COMMIT_HASH" \
    "export NVSHMEM_DIR=$WORKSPACE/nvshmem; "

# In wheel mode nothing was installed; show the artifacts for the user.
if [ "$MODE" = "wheel" ]; then
    echo "All wheels written to $WHEEL_DIR"
    ls -l "$WHEEL_DIR"
fi