Sync from v0.13
This commit is contained in:
28
tools/ep_kernels/README.md
Normal file
28
tools/ep_kernels/README.md
Normal file
@@ -0,0 +1,28 @@
|
||||
# Expert parallel kernels
|
||||
|
||||
Large-scale cluster-level expert parallel, as described in the [DeepSeek-V3 Technical Report](http://arxiv.org/abs/2412.19437), is an efficient way to deploy sparse MoE models with many experts. However, such deployment requires many components beyond a normal Python package, including system package support and system driver support. It is impossible to bundle all these components into a Python package.
|
||||
|
||||
Here we break the requirements down into two steps:
|
||||
|
||||
1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
|
||||
2. Configure the NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine.
|
||||
|
||||
Step 2 is necessary for multi-node deployment.
|
||||
|
||||
All scripts accept a positional argument as workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# for hopper
|
||||
TORCH_CUDA_ARCH_LIST="9.0" bash install_python_libraries.sh
|
||||
# for blackwell
|
||||
TORCH_CUDA_ARCH_LIST="10.0" bash install_python_libraries.sh
|
||||
```
|
||||
|
||||
Additional step for multi-node deployment:
|
||||
|
||||
```bash
|
||||
sudo bash configure_system_drivers.sh # update-initramfs can take several minutes
|
||||
sudo reboot # Reboot is required to load the new driver
|
||||
```
|
||||
17
tools/ep_kernels/configure_system_drivers.sh
Executable file
17
tools/ep_kernels/configure_system_drivers.sh
Executable file
@@ -0,0 +1,17 @@
|
||||
#!/usr/bin/env bash
# Configure the NVIDIA kernel driver for IBGDA (GPU-initiated RDMA), as
# required for multi-node expert-parallel deployment. Run as root
# (`sudo bash configure_system_drivers.sh`, per the README) and reboot
# afterwards so the new module options take effect.
set -ex

# turn on IBGDA
NVIDIA_CONF=/etc/modprobe.d/nvidia.conf
NVIDIA_OPTS='options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"'

# Append only once: re-running this script must not accumulate duplicate
# `options nvidia` lines in the modprobe configuration.
if ! grep -qsF "$NVIDIA_OPTS" "$NVIDIA_CONF"; then
    echo "$NVIDIA_OPTS" | tee -a "$NVIDIA_CONF"
fi

# Rebuild the initramfs so the module options are applied at boot.
if command -v update-initramfs &> /dev/null; then
    # for Debian/Ubuntu
    sudo update-initramfs -u
elif command -v dracut &> /dev/null; then
    # for Fedora/CentOS
    sudo dracut --force
else
    echo "No supported initramfs update tool found."
    exit 1
fi

echo "Please reboot the system to apply the changes"
|
||||
92
tools/ep_kernels/elastic_ep/eep_nvshmem.patch
Normal file
92
tools/ep_kernels/elastic_ep/eep_nvshmem.patch
Normal file
@@ -0,0 +1,92 @@
|
||||
From 18c0599c2f07ec965132efa25961dc8179c2dda3 Mon Sep 17 00:00:00 2001
|
||||
From: Yongji Wu <wuyongji317@gmail.com>
|
||||
Date: Tue, 20 May 2025 13:41:12 -0700
|
||||
Subject: [PATCH] fix reinit issues due to states not cleaned up
|
||||
|
||||
fix double free
|
||||
---
|
||||
src/host/init/init.cu | 10 ++++++++++
|
||||
.../internal/host/nvshmemi_mem_transport.hpp | 15 +++++++++++++++
|
||||
src/modules/bootstrap/uid/bootstrap_uid.cpp | 5 +++++
|
||||
3 files changed, 30 insertions(+)
|
||||
|
||||
diff --git a/src/host/init/init.cu b/src/host/init/init.cu
|
||||
index b1c5dbf..1fecb4b 100644
|
||||
--- a/src/host/init/init.cu
|
||||
+++ b/src/host/init/init.cu
|
||||
@@ -43,6 +43,8 @@
|
||||
#include "internal/host/nvshmemi_types.h"
|
||||
#include "internal/host/shared_memory.h"
|
||||
#include "internal/host/nvshmemi_symmetric_heap.hpp"
|
||||
+// eep-dev
|
||||
+#include "internal/host/nvshmemi_mem_transport.hpp"
|
||||
|
||||
extern __constant__ nvshmemi_device_host_state_t nvshmemi_device_state_d;
|
||||
static std::map<void *, int> registered_device_states;
|
||||
@@ -1293,6 +1295,14 @@ void nvshmemid_hostlib_finalize(void *device_ctx, void *transport_device_ctx) {
|
||||
/* Multi-init Multi-fini*/
|
||||
nvshmemi_state = NULL;
|
||||
nvshmemi_device_state.nvshmemi_is_nvshmem_initialized = 0;
|
||||
+
|
||||
+ // eep-dev
|
||||
+ nvshmemi_mem_p2p_transport::destroy_instance();
|
||||
+ nvshmemi_mem_remote_transport::destroy_instance();
|
||||
+ free(nvshmemi_default_session);
|
||||
+ nvshmemi_default_session = nullptr;
|
||||
+ nvshmemi_device_state.nvshmemi_is_nvshmem_bootstrapped = false;
|
||||
+
|
||||
nvshmemi_is_device_state_ready = false;
|
||||
} else
|
||||
nvshmemi_boot_handle.barrier(&nvshmemi_boot_handle);
|
||||
diff --git a/src/include/internal/host/nvshmemi_mem_transport.hpp b/src/include/internal/host/nvshmemi_mem_transport.hpp
|
||||
index 2495844..e4f408a 100644
|
||||
--- a/src/include/internal/host/nvshmemi_mem_transport.hpp
|
||||
+++ b/src/include/internal/host/nvshmemi_mem_transport.hpp
|
||||
@@ -36,6 +36,13 @@ class nvshmemi_mem_p2p_transport final {
|
||||
return p2p_objref_;
|
||||
}
|
||||
}
|
||||
+ // eep-dev
|
||||
+ static void destroy_instance(void) {
|
||||
+ if (p2p_objref_ != nullptr) {
|
||||
+ delete p2p_objref_;
|
||||
+ p2p_objref_ = nullptr;
|
||||
+ }
|
||||
+ }
|
||||
|
||||
void print_mem_handle(int pe_id, int transport_idx, nvshmemi_symmetric_heap &obj);
|
||||
|
||||
@@ -87,6 +94,14 @@ class nvshmemi_mem_remote_transport final {
|
||||
}
|
||||
}
|
||||
|
||||
+ // eep-dev
|
||||
+ static void destroy_instance(void) {
|
||||
+ if (remote_objref_ != nullptr) {
|
||||
+ delete remote_objref_;
|
||||
+ remote_objref_ = nullptr;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
int gather_mem_handles(nvshmemi_symmetric_heap &obj, uint64_t heap_offset, size_t size);
|
||||
/* On-demand registration and release of memory */
|
||||
int register_mem_handle(nvshmem_mem_handle_t *local_handles, int transport_idx,
|
||||
diff --git a/src/modules/bootstrap/uid/bootstrap_uid.cpp b/src/modules/bootstrap/uid/bootstrap_uid.cpp
|
||||
index a1fa748..788fa96 100644
|
||||
--- a/src/modules/bootstrap/uid/bootstrap_uid.cpp
|
||||
+++ b/src/modules/bootstrap/uid/bootstrap_uid.cpp
|
||||
@@ -630,6 +630,11 @@ int nvshmemi_bootstrap_plugin_pre_init(bootstrap_handle_t* handle, const int abi
|
||||
// Discover the network for bootstrap, if not done previously.
|
||||
// This code needs to be stateful to be able to be called multiple times by the caller
|
||||
BOOTSTRAP_CHECK(bootstrap_net_init());
|
||||
+ // eep-dev
|
||||
+ if (handle->pre_init_ops != nullptr) {
|
||||
+ BOOTSTRAP_PTR_FREE(handle->pre_init_ops);
|
||||
+ handle->pre_init_ops = nullptr;
|
||||
+ }
|
||||
if (handle->pre_init_ops == nullptr) {
|
||||
BOOTSTRAP_CALLOC(&handle->pre_init_ops, 1);
|
||||
handle->pre_init_ops->get_unique_id = bootstrap_get_unique_id;
|
||||
--
|
||||
2.43.0
|
||||
|
||||
86
tools/ep_kernels/elastic_ep/install_eep_libraries.sh
Executable file
86
tools/ep_kernels/elastic_ep/install_eep_libraries.sh
Executable file
@@ -0,0 +1,86 @@
|
||||
#!/bin/bash
# Build and install the elastic-EP kernel stack: a patched NVSHMEM
# (IBGDA-only build) plus pplx-kernels.
#
# Usage: install_eep_libraries.sh [-w workspace_dir] [-n]
#   -w  workspace directory for staging the build
#       (default: $(pwd)/eep_kernels_workspace)
#   -n  skip downloading/patching NVSHMEM sources; reuse an existing
#       $WORKSPACE/nvshmem_src from a previous run
set -ex

# Default workspace directory
WORKSPACE="$(pwd)/eep_kernels_workspace"
INSTALL_NVSHMEM=true

# Parse command line arguments
while getopts "w:n" opt; do
    case $opt in
        w)
            WORKSPACE="$OPTARG"
            ;;
        n)
            INSTALL_NVSHMEM=false
            ;;
        \?)
            echo "Invalid option: -$OPTARG" >&2
            exit 1
            ;;
    esac
done

mkdir -p "$WORKSPACE"

# Fail fast: every build step below needs CUDA, so check before the
# large source downloads rather than after.
# assume CUDA_HOME is set correctly
if [ -z "$CUDA_HOME" ]; then
    echo "CUDA_HOME is not set, please set it to your CUDA installation directory."
    exit 1
fi

# install dependencies if not installed
pip3 install cmake torch ninja

# build nvshmem
pushd "$WORKSPACE"
if [ "$INSTALL_NVSHMEM" = true ]; then
    mkdir -p nvshmem_src
    wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
    tar -xvf nvshmem_src_3.2.5-1.txz -C nvshmem_src --strip-components=1
    pushd nvshmem_src
    wget https://github.com/deepseek-ai/DeepEP/raw/main/third-party/nvshmem.patch
    # `git apply` needs a repository context even for plain file patches.
    git init
    git apply -vvv nvshmem.patch
    # Elastic-EP reinit fixes layered on top of the DeepEP patch.
    git apply --reject --whitespace=fix ../../eep_nvshmem.patch
else
    # -n given: reuse sources staged by a previous run; give a clear error
    # instead of an opaque pushd failure when they are missing.
    if [ ! -d nvshmem_src ]; then
        echo "-n given but $WORKSPACE/nvshmem_src does not exist; run once without -n first." >&2
        exit 1
    fi
    pushd nvshmem_src
fi

# disable all features except IBGDA
export NVSHMEM_IBGDA_SUPPORT=1

export NVSHMEM_SHMEM_SUPPORT=0
export NVSHMEM_UCX_SUPPORT=0
export NVSHMEM_USE_NCCL=0
export NVSHMEM_PMIX_SUPPORT=0
export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
export NVSHMEM_USE_GDRCOPY=0
export NVSHMEM_IBRC_SUPPORT=0
export NVSHMEM_BUILD_TESTS=0
export NVSHMEM_BUILD_EXAMPLES=0
export NVSHMEM_MPI_SUPPORT=0
export NVSHMEM_BUILD_HYDRA_LAUNCHER=0
export NVSHMEM_BUILD_TXZ_PACKAGE=0

cmake -G Ninja -S . -B "$WORKSPACE/nvshmem_build/" -DCMAKE_INSTALL_PREFIX="$WORKSPACE/nvshmem_install"
cmake --build "$WORKSPACE/nvshmem_build/" --target install

popd

export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem_install:$CMAKE_PREFIX_PATH

# build and install pplx, require pytorch installed
pushd "$WORKSPACE"
git clone https://github.com/ppl-ai/pplx-kernels
cd pplx-kernels
# see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
# PIP_NO_BUILD_ISOLATION=0 disables build isolation
PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install . --no-deps -v
|
||||
|
||||
190
tools/ep_kernels/install_python_libraries.sh
Executable file
190
tools/ep_kernels/install_python_libraries.sh
Executable file
@@ -0,0 +1,190 @@
|
||||
#!/usr/bin/env bash
set -ex

# usage: ./install_python_libraries.sh [options]
#   --workspace <dir>      workspace directory (default: ./ep_kernels_workspace)
#   --mode <mode>          "install" (default) or "wheel"
#   --pplx-ref <commit>    pplx-kernels commit hash
#   --deepep-ref <commit>  DeepEP commit hash

CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
PPLX_COMMIT_HASH=${PPLX_COMMIT_HASH:-"12cecfd"}
DEEPEP_COMMIT_HASH=${DEEPEP_COMMIT_HASH:-"73b6ea4"}
NVSHMEM_VER=3.3.24 # Supports both CUDA 12 and 13
WORKSPACE=${WORKSPACE:-$(pwd)/ep_kernels_workspace}
MODE=${MODE:-install}
# `egrep` is deprecated; `grep -E` is the portable spelling.
CUDA_VERSION_MAJOR=$(${CUDA_HOME}/bin/nvcc --version | grep -E -o "release [0-9]+" | cut -d ' ' -f 2)

# require_arg <flag> <value>: abort unless <value> is a usable argument
# (present and not another --flag). Factors the identical validation that
# every option below previously duplicated inline.
require_arg() {
    if [[ -z "$2" || "$2" =~ ^- ]]; then
        echo "Error: $1 requires an argument." >&2
        exit 1
    fi
}

# Parse arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --workspace)
            require_arg "$1" "$2"
            WORKSPACE="$2"
            shift 2
            ;;
        --mode)
            require_arg "$1" "$2"
            MODE="$2"
            shift 2
            ;;
        --pplx-ref)
            require_arg "$1" "$2"
            PPLX_COMMIT_HASH="$2"
            shift 2
            ;;
        --deepep-ref)
            require_arg "$1" "$2"
            DEEPEP_COMMIT_HASH="$2"
            shift 2
            ;;
        *)
            echo "Error: Unknown argument '$1'" >&2
            exit 1
            ;;
    esac
done

mkdir -p "$WORKSPACE"

WHEEL_DIR="$WORKSPACE/dist"
mkdir -p "$WHEEL_DIR"

pushd "$WORKSPACE"

# install dependencies if not installed
if [ -z "$VIRTUAL_ENV" ]; then
    uv pip install --system cmake torch ninja
else
    uv pip install cmake torch ninja
fi

# fetch nvshmem (prebuilt archive matching this machine's architecture)
ARCH=$(uname -m)
case "${ARCH,,}" in
    x86_64|amd64)
        NVSHMEM_SUBDIR="linux-x86_64"
        ;;
    aarch64|arm64)
        NVSHMEM_SUBDIR="linux-sbsa"
        ;;
    *)
        echo "Unsupported architecture: ${ARCH}" >&2
        exit 1
        ;;
esac

NVSHMEM_FILE="libnvshmem-${NVSHMEM_SUBDIR}-${NVSHMEM_VER}_cuda${CUDA_VERSION_MAJOR}-archive.tar.xz"
NVSHMEM_URL="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/${NVSHMEM_SUBDIR}/${NVSHMEM_FILE}"

pushd "$WORKSPACE"
echo "Downloading NVSHMEM ${NVSHMEM_VER} for ${NVSHMEM_SUBDIR} ..."
curl -fSL "${NVSHMEM_URL}" -o "${NVSHMEM_FILE}"
tar -xf "${NVSHMEM_FILE}"
mv "${NVSHMEM_FILE%.tar.xz}" nvshmem
rm -f "${NVSHMEM_FILE}"
# NOTE(review): bin/ and share/ nested under lib/ looks odd — confirm against
# the actual archive layout before changing these paths.
rm -rf nvshmem/lib/bin nvshmem/lib/share
popd

export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem/lib/cmake:$CMAKE_PREFIX_PATH
|
||||
|
||||
# is_git_dirty <dir>: return 0 (true) when <dir> is a git checkout with
# uncommitted changes, 1 otherwise.
is_git_dirty() {
    local dir=$1
    pushd "$dir" > /dev/null
    # Suppress git errors on stderr (fd 2). The original used `3>/dev/null`,
    # which redirected an unused descriptor and let git errors leak through.
    if [ -d ".git" ] && [ -n "$(git status --porcelain 2>/dev/null)" ]; then
        popd > /dev/null
        return 0
    else
        popd > /dev/null
        return 1
    fi
}
|
||||
|
||||
# clone_repo <repo_url> <dir_name> <key_file> <commit_hash>
# Clone <repo_url> into <dir_name>, checking out <commit_hash> when given,
# unless a usable checkout already exists. <key_file> is a file whose
# presence marks the checkout as complete (e.g. setup.py). A dirty checkout
# is left untouched so local modifications are never destroyed.
clone_repo() {
    local repo_url=$1
    local dir_name=$2
    local key_file=$3
    local commit_hash=$4

    # fresh_clone: the clone-and-pin sequence that was previously duplicated
    # verbatim in both the re-clone and first-clone branches.
    fresh_clone() {
        git clone "$repo_url"
        if [ -n "$commit_hash" ]; then
            cd "$dir_name"
            git checkout "$commit_hash"
            cd ..
        fi
    }

    if [ -d "$dir_name" ]; then
        if is_git_dirty "$dir_name"; then
            echo "$dir_name directory is dirty, skipping clone"
        elif [ ! -d "$dir_name/.git" ] || [ ! -f "$dir_name/$key_file" ]; then
            echo "$dir_name directory exists but clone appears incomplete, cleaning up and re-cloning"
            rm -rf "$dir_name"
            fresh_clone
        else
            echo "$dir_name directory exists and appears complete"
        fi
    else
        fresh_clone
    fi
}
|
||||
|
||||
# do_build <repo_url> <name> <key_file> <commit> <extra_env>
# Fetch the project pinned at <commit>, then either install it into the
# active environment or build a wheel into $WHEEL_DIR, depending on $MODE.
# <extra_env> is a shell snippet (e.g. "export VAR=...; ") eval'd in front
# of the build command so project-specific variables are in scope.
do_build() {
    local src_url=$1
    local project=$2
    local marker=$3
    local pin=$4
    local env_prefix=$5

    pushd "$WORKSPACE"
    clone_repo "$src_url" "$project" "$marker" "$pin"
    cd "$project"

    # DeepEP CUDA 13 patch: add the CCCL include dir to setup.py.
    if [[ "$project" == "DeepEP" && "${CUDA_VERSION_MAJOR}" -ge 13 ]]; then
        sed -i "s|f'{nvshmem_dir}/include']|f'{nvshmem_dir}/include', '${CUDA_HOME}/include/cccl']|" "setup.py"
    fi

    if [ "$MODE" = "install" ]; then
        echo "Installing $project into environment"
        eval "$env_prefix" uv pip install --no-build-isolation -vvv .
    else
        echo "Building $project wheel into $WHEEL_DIR"
        eval "$env_prefix" uv build --wheel --no-build-isolation -vvv --out-dir "$WHEEL_DIR" .
    fi
    popd
}
|
||||
|
||||
# Build both kernel packages. do_build args: repo url, directory name,
# key file that marks a complete checkout, pinned commit, env-setup prefix.

# build pplx-kernels
do_build "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" "$PPLX_COMMIT_HASH" ""

# build DeepEP — it needs NVSHMEM_DIR so its setup.py can find the headers
do_build "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" "$DEEPEP_COMMIT_HASH" \
    "export NVSHMEM_DIR=$WORKSPACE/nvshmem; "

if [ "$MODE" = "wheel" ]; then
    echo "All wheels written to $WHEEL_DIR"
    ls -l "$WHEEL_DIR"
fi
|
||||
Reference in New Issue
Block a user