Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -0,0 +1,28 @@
# Expert parallel kernels
Large-scale cluster-level expert parallel, as described in the [DeepSeek-V3 Technical Report](http://arxiv.org/abs/2412.19437), is an efficient way to deploy sparse MoE models with many experts. However, such deployment requires many components beyond a normal Python package, including system package support and system driver support. It is impossible to bundle all these components into a Python package.
Here we break down the requirements into two steps:
1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
2. Configure NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine.
Step 2 is necessary for multi-node deployment.
All scripts accept a positional argument as workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`.
## Usage
```bash
# for hopper
TORCH_CUDA_ARCH_LIST="9.0" bash install_python_libraries.sh
# for blackwell
TORCH_CUDA_ARCH_LIST="10.0" bash install_python_libraries.sh
```
Additional step for multi-node deployment:
```bash
sudo bash configure_system_drivers.sh # update-initramfs can take several minutes
sudo reboot # Reboot is required to load the new driver
```

View File

@@ -0,0 +1,17 @@
set -ex

# Enable IBGDA (InfiniBand GPUDirect Async) for the NVIDIA kernel module.
# Must be run with root privileges (see README: `sudo bash configure_system_drivers.sh`),
# and the machine must be rebooted afterwards to reload the driver.
NVIDIA_CONF=/etc/modprobe.d/nvidia.conf
IBGDA_LINE='options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"'

# turn on IBGDA — append only if not already present, so re-running this
# script does not accumulate duplicate option lines in the modprobe config.
if ! grep -qsF "$IBGDA_LINE" "$NVIDIA_CONF"; then
    echo "$IBGDA_LINE" | tee -a "$NVIDIA_CONF"
fi

# Rebuild the initramfs so the new module options take effect at boot.
if command -v update-initramfs &> /dev/null; then
    # for Debian/Ubuntu
    sudo update-initramfs -u
elif command -v dracut &> /dev/null; then
    # for Fedora/CentOS
    sudo dracut --force
else
    echo "No supported initramfs update tool found."
    exit 1
fi

echo "Please reboot the system to apply the changes"

View File

@@ -0,0 +1,92 @@
From 18c0599c2f07ec965132efa25961dc8179c2dda3 Mon Sep 17 00:00:00 2001
From: Yongji Wu <wuyongji317@gmail.com>
Date: Tue, 20 May 2025 13:41:12 -0700
Subject: [PATCH] fix reinit issues due to states not cleaned up
fix double free
---
src/host/init/init.cu | 10 ++++++++++
.../internal/host/nvshmemi_mem_transport.hpp | 15 +++++++++++++++
src/modules/bootstrap/uid/bootstrap_uid.cpp | 5 +++++
3 files changed, 30 insertions(+)
diff --git a/src/host/init/init.cu b/src/host/init/init.cu
index b1c5dbf..1fecb4b 100644
--- a/src/host/init/init.cu
+++ b/src/host/init/init.cu
@@ -43,6 +43,8 @@
#include "internal/host/nvshmemi_types.h"
#include "internal/host/shared_memory.h"
#include "internal/host/nvshmemi_symmetric_heap.hpp"
+// eep-dev
+#include "internal/host/nvshmemi_mem_transport.hpp"
extern __constant__ nvshmemi_device_host_state_t nvshmemi_device_state_d;
static std::map<void *, int> registered_device_states;
@@ -1293,6 +1295,14 @@ void nvshmemid_hostlib_finalize(void *device_ctx, void *transport_device_ctx) {
/* Multi-init Multi-fini*/
nvshmemi_state = NULL;
nvshmemi_device_state.nvshmemi_is_nvshmem_initialized = 0;
+
+ // eep-dev
+ nvshmemi_mem_p2p_transport::destroy_instance();
+ nvshmemi_mem_remote_transport::destroy_instance();
+ free(nvshmemi_default_session);
+ nvshmemi_default_session = nullptr;
+ nvshmemi_device_state.nvshmemi_is_nvshmem_bootstrapped = false;
+
nvshmemi_is_device_state_ready = false;
} else
nvshmemi_boot_handle.barrier(&nvshmemi_boot_handle);
diff --git a/src/include/internal/host/nvshmemi_mem_transport.hpp b/src/include/internal/host/nvshmemi_mem_transport.hpp
index 2495844..e4f408a 100644
--- a/src/include/internal/host/nvshmemi_mem_transport.hpp
+++ b/src/include/internal/host/nvshmemi_mem_transport.hpp
@@ -36,6 +36,13 @@ class nvshmemi_mem_p2p_transport final {
return p2p_objref_;
}
}
+ // eep-dev
+ static void destroy_instance(void) {
+ if (p2p_objref_ != nullptr) {
+ delete p2p_objref_;
+ p2p_objref_ = nullptr;
+ }
+ }
void print_mem_handle(int pe_id, int transport_idx, nvshmemi_symmetric_heap &obj);
@@ -87,6 +94,14 @@ class nvshmemi_mem_remote_transport final {
}
}
+ // eep-dev
+ static void destroy_instance(void) {
+ if (remote_objref_ != nullptr) {
+ delete remote_objref_;
+ remote_objref_ = nullptr;
+ }
+ }
+
int gather_mem_handles(nvshmemi_symmetric_heap &obj, uint64_t heap_offset, size_t size);
/* On-demand registration and release of memory */
int register_mem_handle(nvshmem_mem_handle_t *local_handles, int transport_idx,
diff --git a/src/modules/bootstrap/uid/bootstrap_uid.cpp b/src/modules/bootstrap/uid/bootstrap_uid.cpp
index a1fa748..788fa96 100644
--- a/src/modules/bootstrap/uid/bootstrap_uid.cpp
+++ b/src/modules/bootstrap/uid/bootstrap_uid.cpp
@@ -630,6 +630,11 @@ int nvshmemi_bootstrap_plugin_pre_init(bootstrap_handle_t* handle, const int abi
// Discover the network for bootstrap, if not done previously.
// This code needs to be stateful to be able to be called multiple times by the caller
BOOTSTRAP_CHECK(bootstrap_net_init());
+ // eep-dev
+ if (handle->pre_init_ops != nullptr) {
+ BOOTSTRAP_PTR_FREE(handle->pre_init_ops);
+ handle->pre_init_ops = nullptr;
+ }
if (handle->pre_init_ops == nullptr) {
BOOTSTRAP_CALLOC(&handle->pre_init_ops, 1);
handle->pre_init_ops->get_unique_id = bootstrap_get_unique_id;
--
2.43.0

View File

@@ -0,0 +1,86 @@
#!/bin/bash
set -ex

# Build and install the expert-parallel kernel stack:
#   1. NVSHMEM 3.2.5, patched for DeepEP and for clean re-initialization
#   2. pplx-kernels (requires PyTorch to be installed at build time)
#
# Usage: [-w workspace_dir] [-n]
#   -w  staging workspace (default: ./eep_kernels_workspace)
#   -n  skip downloading/patching NVSHMEM (reuse existing nvshmem_src)

# Default workspace directory
WORKSPACE=$(pwd)/eep_kernels_workspace
INSTALL_NVSHMEM=true

# Parse command line arguments
while getopts "w:n" opt; do
    case $opt in
        w)
            WORKSPACE="$OPTARG"
            ;;
        n)
            INSTALL_NVSHMEM=false
            ;;
        \?)
            echo "Invalid option: -$OPTARG" >&2
            exit 1
            ;;
    esac
done

# Fail fast before any downloads: the NVSHMEM build requires a CUDA toolchain.
if [ -z "$CUDA_HOME" ]; then
    echo "CUDA_HOME is not set, please set it to your CUDA installation directory."
    exit 1
fi

mkdir -p "$WORKSPACE"

# install dependencies if not installed
pip3 install cmake torch ninja

# build nvshmem
pushd "$WORKSPACE"
if [ "$INSTALL_NVSHMEM" = true ]; then
    mkdir -p nvshmem_src
    wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
    tar -xvf nvshmem_src_3.2.5-1.txz -C nvshmem_src --strip-components=1
    pushd nvshmem_src
    wget https://github.com/deepseek-ai/DeepEP/raw/main/third-party/nvshmem.patch
    # git init lets `git apply` run inside an extracted (non-repo) tree
    git init
    git apply -vvv nvshmem.patch
    # reinit/double-free fixes; expects eep_nvshmem.patch two levels up
    git apply --reject --whitespace=fix ../../eep_nvshmem.patch
else
    pushd nvshmem_src
fi

# disable all features except IBGDA
export NVSHMEM_IBGDA_SUPPORT=1
export NVSHMEM_SHMEM_SUPPORT=0
export NVSHMEM_UCX_SUPPORT=0
export NVSHMEM_USE_NCCL=0
export NVSHMEM_PMIX_SUPPORT=0
export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
export NVSHMEM_USE_GDRCOPY=0
export NVSHMEM_IBRC_SUPPORT=0
export NVSHMEM_BUILD_TESTS=0
export NVSHMEM_BUILD_EXAMPLES=0
export NVSHMEM_MPI_SUPPORT=0
export NVSHMEM_BUILD_HYDRA_LAUNCHER=0
export NVSHMEM_BUILD_TXZ_PACKAGE=0

cmake -G Ninja -S . -B "$WORKSPACE/nvshmem_build/" -DCMAKE_INSTALL_PREFIX="$WORKSPACE/nvshmem_install"
cmake --build "$WORKSPACE/nvshmem_build/" --target install

popd

# let CMake find the freshly installed NVSHMEM
export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem_install:$CMAKE_PREFIX_PATH

# build and install pplx, require pytorch installed
pushd "$WORKSPACE"
git clone https://github.com/ppl-ai/pplx-kernels
cd pplx-kernels
# see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
# PIP_NO_BUILD_ISOLATION=0 disables build isolation
PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install . --no-deps -v

View File

@@ -0,0 +1,190 @@
#!/usr/bin/env bash
set -ex

# usage: ./install_python_libraries.sh [options]
#   --workspace <dir>     workspace directory (default: ./ep_kernels_workspace)
#   --mode <mode>         "install" (default) or "wheel"
#   --pplx-ref <commit>   pplx-kernels commit hash
#   --deepep-ref <commit> DeepEP commit hash

CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
PPLX_COMMIT_HASH=${PPLX_COMMIT_HASH:-"12cecfd"}
DEEPEP_COMMIT_HASH=${DEEPEP_COMMIT_HASH:-"73b6ea4"}
NVSHMEM_VER=3.3.24 # Supports both CUDA 12 and 13
WORKSPACE=${WORKSPACE:-$(pwd)/ep_kernels_workspace}
MODE=${MODE:-install}
# e.g. "release 12" -> "12"; egrep is deprecated, use grep -E
CUDA_VERSION_MAJOR=$(${CUDA_HOME}/bin/nvcc --version | grep -Eo "release [0-9]+" | cut -d ' ' -f 2)

# Abort when an option is missing its value ($2 absent or another flag).
require_arg() {
    if [[ -z "$2" || "$2" =~ ^- ]]; then
        echo "Error: $1 requires an argument." >&2
        exit 1
    fi
}

# Parse arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --workspace)
            require_arg "$1" "${2:-}"
            WORKSPACE="$2"
            shift 2
            ;;
        --mode)
            require_arg "$1" "${2:-}"
            MODE="$2"
            shift 2
            ;;
        --pplx-ref)
            require_arg "$1" "${2:-}"
            PPLX_COMMIT_HASH="$2"
            shift 2
            ;;
        --deepep-ref)
            require_arg "$1" "${2:-}"
            DEEPEP_COMMIT_HASH="$2"
            shift 2
            ;;
        *)
            echo "Error: Unknown argument '$1'" >&2
            exit 1
            ;;
    esac
done

mkdir -p "$WORKSPACE"
WHEEL_DIR="$WORKSPACE/dist"
mkdir -p "$WHEEL_DIR"

pushd "$WORKSPACE"

# install dependencies if not installed
if [ -z "$VIRTUAL_ENV" ]; then
    uv pip install --system cmake torch ninja
else
    uv pip install cmake torch ninja
fi

# fetch the prebuilt NVSHMEM archive for this machine's architecture
ARCH=$(uname -m)
case "${ARCH,,}" in
    x86_64|amd64)
        NVSHMEM_SUBDIR="linux-x86_64"
        ;;
    aarch64|arm64)
        NVSHMEM_SUBDIR="linux-sbsa"
        ;;
    *)
        echo "Unsupported architecture: ${ARCH}" >&2
        exit 1
        ;;
esac

NVSHMEM_FILE="libnvshmem-${NVSHMEM_SUBDIR}-${NVSHMEM_VER}_cuda${CUDA_VERSION_MAJOR}-archive.tar.xz"
NVSHMEM_URL="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/${NVSHMEM_SUBDIR}/${NVSHMEM_FILE}"

pushd "$WORKSPACE"
echo "Downloading NVSHMEM ${NVSHMEM_VER} for ${NVSHMEM_SUBDIR} ..."
curl -fSL "${NVSHMEM_URL}" -o "${NVSHMEM_FILE}"
tar -xf "${NVSHMEM_FILE}"
mv "${NVSHMEM_FILE%.tar.xz}" nvshmem
rm -f "${NVSHMEM_FILE}"
# NOTE(review): these paths look odd — presumably meant nvshmem/bin and
# nvshmem/share; harmless either way since -rf ignores missing paths. TODO confirm.
rm -rf nvshmem/lib/bin nvshmem/lib/share
popd

# let CMake locate the just-unpacked NVSHMEM
export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem/lib/cmake:$CMAKE_PREFIX_PATH
# Return 0 (success) when $1 is a git checkout with uncommitted changes,
# 1 otherwise (including when $1 is not a git repository).
is_git_dirty() {
    local dir=$1
    pushd "$dir" > /dev/null
    # Bug fix: the original used `3>/dev/null`, which redirects file
    # descriptor 3 (a no-op here) and leaves git's stderr visible;
    # stderr is fd 2.
    if [ -d ".git" ] && [ -n "$(git status --porcelain 2>/dev/null)" ]; then
        popd > /dev/null
        return 0
    else
        popd > /dev/null
        return 1
    fi
}
# Clone $1 into directory $2 unless a usable checkout is already there.
#   $1 repo_url    - git URL to clone
#   $2 dir_name    - target directory name
#   $3 key_file    - file whose presence marks a complete checkout
#   $4 commit_hash - optional commit to check out after cloning
# A dirty checkout is left untouched; an incomplete one is wiped and re-cloned.
clone_repo() {
    local repo_url=$1
    local dir_name=$2
    local key_file=$3
    local commit_hash=$4

    if [ -d "$dir_name" ]; then
        if is_git_dirty "$dir_name"; then
            echo "$dir_name directory is dirty, skipping clone"
            return 0
        elif [ -d "$dir_name/.git" ] && [ -f "$dir_name/$key_file" ]; then
            echo "$dir_name directory exists and appears complete"
            return 0
        fi
        echo "$dir_name directory exists but clone appears incomplete, cleaning up and re-cloning"
        rm -rf "$dir_name"
    fi

    # Single clone path (the original duplicated clone+checkout in two
    # branches); `git -C` avoids error-prone cd/cd.. pairs.
    git clone "$repo_url"
    if [ -n "$commit_hash" ]; then
        git -C "$dir_name" checkout "$commit_hash"
    fi
}
# Clone (if needed) and build or install one Python kernel package.
#   $1 repo      - git URL to clone
#   $2 name      - project directory name
#   $3 key       - file marking a complete checkout (passed to clone_repo)
#   $4 commit    - optional commit hash to check out
#   $5 extra_env - shell snippet eval'd in front of the build command
#                  (e.g. "export NVSHMEM_DIR=...; ") so exported variables
#                  are visible to the build
do_build() {
    local repo=$1
    local name=$2
    local key=$3
    local commit=$4
    local extra_env=$5
    pushd "$WORKSPACE"
    clone_repo "$repo" "$name" "$key" "$commit"
    cd "$name"
    # DeepEP CUDA 13 patch
    # NOTE(review): rewrites DeepEP's setup.py include list so the build also
    # finds the CCCL headers shipped under ${CUDA_HOME}/include/cccl on
    # CUDA >= 13; no-op for older toolkits. TODO confirm against DeepEP setup.py.
    if [[ "$name" == "DeepEP" && "${CUDA_VERSION_MAJOR}" -ge 13 ]]; then
        sed -i "s|f'{nvshmem_dir}/include']|f'{nvshmem_dir}/include', '${CUDA_HOME}/include/cccl']|" "setup.py"
    fi
    # --no-build-isolation: build against the torch already installed above.
    if [ "$MODE" = "install" ]; then
        echo "Installing $name into environment"
        eval "$extra_env" uv pip install --no-build-isolation -vvv .
    else
        echo "Building $name wheel into $WHEEL_DIR"
        eval "$extra_env" uv build --wheel --no-build-isolation -vvv --out-dir "$WHEEL_DIR" .
    fi
    popd
}
# build pplx-kernels (no extra environment needed)
do_build \
    "https://github.com/ppl-ai/pplx-kernels" \
    "pplx-kernels" \
    "setup.py" \
    "$PPLX_COMMIT_HASH" \
    ""

# build DeepEP — NVSHMEM_DIR tells its build where the NVSHMEM tree lives
do_build \
    "https://github.com/deepseek-ai/DeepEP" \
    "DeepEP" \
    "setup.py" \
    "$DEEPEP_COMMIT_HASH" \
    "export NVSHMEM_DIR=$WORKSPACE/nvshmem; "

# In wheel mode nothing was installed; show the artifacts for the user.
if [ "$MODE" = "wheel" ]; then
    echo "All wheels written to $WHEEL_DIR"
    ls -l "$WHEEL_DIR"
fi