#!/bin/bash
#SBATCH --time=12:00:00
#SBATCH --nodes=24
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=288
#SBATCH --output=/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/logs/%x_%j.out
#SBATCH --job-name=g1_gptlong_top8_32b__Qwen3-8B
#SBATCH --mail-type=END,TIME_LIMIT,FAIL
#SBATCH --mail-user=
#SBATCH -p booster
#SBATCH --account reformo
#SBATCH --gres=gpu:4
#SBATCH --exclude=jpbo-031-[01-48]

# ==============================================================================
# Universal SFT Training SBATCH Template
# ==============================================================================
# This template replaces the cluster-specific *_train.sbatch scripts by delegating
# all logic to the SFTJobRunner Python class.
#
# Usage: The launcher writes a JSON config file and substitutes its path into
# this template. For this job the config file is:
#   /e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_sft_config.json
#
# NOTE: the shebang must stay on line 1 and the #SBATCH directives must stay
# ahead of the first executable line, otherwise they are not picked up.
# ==============================================================================
# --- Shell bootstrap ---
# Conda is sourced and activated first; strict mode (-e, pipefail) is only
# switched on afterwards, so failures in the two conda lines do not abort.
source /e/scratch/jureap59/feuer1/miniforge3/etc/profile.d/conda.sh
conda activate otagent
set -eo pipefail
ml purge
ulimit -c 0 # Disable core dumps to avoid filling disk space

# Handle bash completion scripts that use BASH_COMPLETION_DEBUG:
# define it as empty only when it is completely unset.
if [ -z "${BASH_COMPLETION_DEBUG+x}" ]; then
  export BASH_COMPLETION_DEBUG=""
fi

# --- Clean up /tmp to prevent state pollution from previous jobs ---
# Some HPC systems retain /tmp contents across job allocations on the same node.
# This can cause issues with tmux sessions, container state, and other temporary files.
# Every removal is best-effort: errors are silenced and never abort the job.
rm -rf /tmp/tmux-* 2>/dev/null || true
rm -rf /tmp/ray 2>/dev/null || true
rm -rf /tmp/hf_home 2>/dev/null || true
rm -rf /tmp/containers 2>/dev/null || true
rm -rf /tmp/podman-* 2>/dev/null || true

# Guard conda deactivate scripts from set -u complaints
export CONDA_BACKUP_CXX="${CONDA_BACKUP_CXX:-}"
export CONDA_BACKUP_CC="${CONDA_BACKUP_CC:-}"
export CONDA_BACKUP_FC="${CONDA_BACKUP_FC:-}"

# --- Module and Conda Setup ---

# --- Module loading (cluster-specific, substituted by launcher) ---
module load nvidia-compilers/25.9-CUDA-13
# --- Environment setup ---
# Working directory precedence: DCFT_PRIVATE, then DCFT, then the directory
# the job was started from.
if [ -n "${DCFT_PRIVATE:-}" ]; then
  WORKDIR="$DCFT_PRIVATE"
elif [ -n "${DCFT:-}" ]; then
  WORKDIR="$DCFT"
else
  WORKDIR="$PWD"
fi
cd "$WORKDIR"

# Make sure DCFT is always defined for the steps below.
if [ -z "${DCFT:-}" ]; then
  export DCFT="$WORKDIR"
fi

# --- Conda activation (cluster-specific, substituted by launcher) ---
# No conda activation configured
# (the "otagent" conda environment is already activated at the top of this script)

# --- Source environment files ---
# Cluster dotenv file: sourced as-is, so only variables it exports itself
# reach child processes.
if [ -n "${DCFT:-}" ] && [ -f "$DCFT/hpc/dotenv/jupiter.env" ]; then
  source "$DCFT/hpc/dotenv/jupiter.env"
fi
# Secrets file: `set -a` auto-exports every variable it assigns.
if [ -n "${DC_AGENT_SECRET_ENV:-}" ] && [ -f "$DC_AGENT_SECRET_ENV" ]; then
  set -a
  source "$DC_AGENT_SECRET_ENV"
  set +a
fi
# NOTE(review): this evaluates arbitrary shell text taken from the environment;
# it is only safe while DCFT_ACTIVATE_ENV comes from a trusted source.
if [ -n "${DCFT_ACTIVATE_ENV:-}" ]; then
  eval "$DCFT_ACTIVATE_ENV"
fi
# --- CUDA path detection (Perlmutter and similar) ---
# (nothing is configured in this section for this job)

# --- NCCL/Networking settings (cluster-specific) ---
# Cluster-specific NCCL/networking settings
export NCCL_DEBUG="INFO"
export NCCL_NET_GDR_LEVEL="0"
export NCCL_SOCKET_IFNAME="ib0"
export NCCL_IB_TIMEOUT="60"

# --- Cluster-specific environment variables ---
export WANDB_MODE="offline"
export GLOO_USE_IPV6="0"
export NCCL_SOCKET_FAMILY="AF_INET"
export SKYRL_ENABLE_NUMA_AFFINITY="1"
export DISABLE_AIOHTTP_TRANSPORT="True"
export VLLM_ALLREDUCE_USE_SYMM_MEM="0"
# --- Ray defaults ---
export RAY_CGRAPH_get_timeout="${RAY_CGRAPH_get_timeout:-900}"
# GH200 unified memory: GPU HBM is part of system RAM, so Ray's
# memory monitor double-counts GPU allocations and kills workers
# during model loading. Disable the monitor entirely.
export RAY_memory_monitor_refresh_ms=0
# Unless the caller provides RAY_TMPDIR, use a per-job directory under
# /tmp/ray (keyed by the Slurm job id, or this shell's PID outside Slurm)
# and create it.
if [ -z "${RAY_TMPDIR:-}" ]; then
  RAY_TMPDIR_BASE="/tmp/ray"
  RAY_TMPDIR="${RAY_TMPDIR_BASE}/ray_${SLURM_JOB_ID:-$$}"
  mkdir -p "$RAY_TMPDIR"
fi
export RAY_TMPDIR="${RAY_TMPDIR}"
echo "[ray] RAY_TMPDIR=$RAY_TMPDIR"
# --- Standard environment variables ---
export PYTHONFAULTHANDLER=1
export TORCH_SHOW_CPP_STACKTRACES=1
export CUDA_LAUNCH_BLOCKING=0
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
export OMP_NUM_THREADS=1
# Prepend the working directory; the ':' separator is only added when
# PYTHONPATH already has content, so no empty trailing entry is produced.
export PYTHONPATH="${WORKDIR}${PYTHONPATH:+:${PYTHONPATH}}"
export DISABLE_VERSION_CHECK=1 # Skip LlamaFactory transformers version check
# --- Distributed training setup ---
# The first hostname of the allocation acts as the rendezvous master.
# Assignment and export are kept separate so that a failing `scontrol` is not
# hidden behind the exit status of `export`; `sed -n 1p` consumes the whole
# list, so the producer never sees a closed pipe.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | sed -n '1p')
export MASTER_ADDR
# If the hostname contains jrc or jwb, set the master address to
# ${MASTER_ADDR}i to use InfiniBand.
if [[ "$MASTER_ADDR" == *"jrc"* || "$MASTER_ADDR" == *"jwb"* ]]; then
  export MASTER_ADDR="${MASTER_ADDR}i"
fi

echo "MASTER_ADDR set to $MASTER_ADDR"
export MASTER_PORT=12802
export NUM_NODES=$SLURM_JOB_NUM_NODES
export NUM_GPUS_PER_NODE=4
# Total GPU count, derived from the same node count that NUM_NODES exposes.
export NUM_GPUS=$((NUM_GPUS_PER_NODE * NUM_NODES))
# --- HuggingFace/WandB paths ---
# HF_HOME precedence: existing HF_HOME, then HF_HUB_CACHE, then /tmp/hf_home.
export HF_HOME="${HF_HOME:-${HF_HUB_CACHE:-/tmp/hf_home}}"
# NOTE(review): the default below (and the mkdir paths further down) append an
# absolute path to $DCFT ("$DCFT//e/scratch/..."), so they resolve inside $DCFT
# rather than to the /e/scratch/... tree used by the #SBATCH --output path.
# Confirm which location is intended before changing it.
export WANDB_DIR="${DCFT_WANDB_DIR:-$DCFT//e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/wandb}"

# --- Triton/TorchInductor cache settings (node-local to avoid shared FS issues) ---
export TRITON_CACHE_VERBOSE=1
source "$WORKDIR/hpc/shell_utils/triton_cache.sh"

# --- Create experiment directories ---
mkdir -p \
  "$DCFT//e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b" \
  "$DCFT//e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/logs" \
  "$DCFT//e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/tmp"
# --- Supabase environment variables for DB registration ---
# Re-export each Supabase setting that has a value (so child processes see it)
# and warn on stderr about every one that is missing or empty.
for _supabase_var in SUPABASE_URL SUPABASE_ANON_KEY SUPABASE_SERVICE_ROLE_KEY; do
  if [[ -n "${!_supabase_var:-}" ]]; then
    export "${_supabase_var}=${!_supabase_var}"
  else
    echo "Warning: ${_supabase_var} is not set; Supabase registration may fail." >&2
  fi
done
# --- SSH Tunneling (JSC clusters only) ---
# ============================================================================
# SSH Tunnel + Proxychains Setup for No-Internet Clusters (JSC)
#
# Creates SOCKS5 proxy via SSH tunnel to login node, then uses proxychains
# to route external traffic through the tunnel.
#
# Jupiter (ARM GH200): Uses wrapped binary approach (proxychains4 -f <config> cmd)
# Other JSC clusters: Uses LD_PRELOAD approach for Ray worker inheritance
#
# The logic lives in a function so that every "skip proxy setup" path can use
# `return 0`. A bare `return` at the top level of an executed (non-sourced)
# script is an error and, under `set -e`, would abort the job instead of
# skipping. Variables are deliberately NOT declared `local`: they stay global,
# exactly as they were when this code ran at the top level.
# ============================================================================
setup_proxy_tunnel() {
  # Determine login node and proxychains paths based on cluster
  NODE_HOST=$(hostname -s)
  PROXYCHAINS_MODE="" # "binary" or "ldpreload"

  if [[ $NODE_HOST == jrc* ]]; then
    LOGIN_NODE="jrlogin05i"
    PROXYCHAINS_LIB="/p/scratch/synthlaion/dc-agent-shared/tools/proxychains-ng-install/lib/libproxychains4.so"
    PROXYCHAINS_MODE="ldpreload"
  elif [[ $NODE_HOST == jwb* ]]; then
    LOGIN_NODE="jwlogin22i"
    PROXYCHAINS_LIB="/p/scratch/synthlaion/dc-agent-shared/tools/proxychains-ng-install/lib/libproxychains4.so"
    PROXYCHAINS_MODE="ldpreload"
  elif [[ $NODE_HOST == jpb* ]] || [[ $NODE_HOST == jpc* ]]; then
    LOGIN_NODE="jpbl-s01-01"
    # Jupiter uses aarch64 build - binary wrapper approach (LD_PRELOAD doesn't work reliably)
    PROXYCHAINS_BIN="/e/scratch/jureap59/feuer1/proxychains-ng-aarch64/bin/proxychains4"
    PROXYCHAINS_MODE="binary"
  elif [[ $NODE_HOST == lrdn* ]] || [[ $NODE_HOST == *.leonardo.local ]]; then
    LOGIN_NODE="login05-ext.leonardo.cineca.it"
    # Leonardo uses x86 build - binary wrapper approach
    PROXYCHAINS_BIN="/leonardo/home/userexternal/bfeuer00/proxychains/bin/proxychains4"
    PROXYCHAINS_MODE="binary"
  else
    echo "[proxy] Unknown cluster for node $NODE_HOST - skipping proxy setup"
    return 0
  fi

  TUNNEL_PORT=7003

  # Check if proxychains is available
  if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
    if [ ! -x "$PROXYCHAINS_BIN" ]; then
      echo "[proxy] ✗ proxychains binary not found at $PROXYCHAINS_BIN"
      echo "[proxy] Skipping proxy setup - external connectivity will fail"
      return 0
    fi
    echo "[proxy] ✓ Found proxychains binary at $PROXYCHAINS_BIN"
  else
    if [ ! -f "$PROXYCHAINS_LIB" ]; then
      echo "[proxy] ✗ proxychains library not found at $PROXYCHAINS_LIB"
      echo "[proxy] Skipping proxy setup - external connectivity will fail"
      return 0
    fi
    echo "[proxy] ✓ Found proxychains library at $PROXYCHAINS_LIB"
  fi

  if [ -z "${SSH_KEY:-}" ]; then
    echo "[proxy] SSH_KEY not set - skipping proxy setup"
    echo "[proxy] Set SSH_KEY in your environment to enable internet access"
  else
    # Get this node's IP address for multi-node proxy access.
    # Resolution is best-effort: a failing lookup must not abort the job, and
    # an empty address would yield an unusable proxychains config, so skip.
    NODE_IP=$(nslookup "$NODE_HOST" | grep 'Address' | tail -n1 | awk '{print $2}') || NODE_IP=""
    if [ -z "$NODE_IP" ]; then
      echo "[proxy] ✗ Could not resolve an IP address for $NODE_HOST"
      echo "[proxy] Skipping proxy setup - external connectivity will fail"
      return 0
    fi
    echo "[proxy] Setting up SSH tunnel to $LOGIN_NODE"
    echo "[proxy] SSH key: $SSH_KEY"
    echo "[proxy] Tunnel port: $TUNNEL_PORT"
    echo "[proxy] Node IP: $NODE_IP (workers will connect here)"

    # Create SSH tunnel with SOCKS5 proxy
    # -g flag allows remote hosts (worker nodes) to connect to the tunnel
    # NOTE(review): StrictHostKeyChecking=no disables host key verification.
    # The call sits in an `if` so that an ssh failure reaches the skip path
    # instead of aborting the whole job under `set -e`.
    if ! ssh -g -f -N -D "${TUNNEL_PORT}" \
      -o StrictHostKeyChecking=no \
      -o ConnectTimeout=1000 \
      -o ServerAliveInterval=10 \
      -o ServerAliveCountMax=30 \
      -o TCPKeepAlive=yes \
      -o ExitOnForwardFailure=yes \
      -o BatchMode=yes \
      -i "${SSH_KEY}" \
      "${USER}@${LOGIN_NODE}"; then
      echo "[proxy] ✗ SSH tunnel failed to start"
      return 0
    fi

    # Give tunnel time to establish
    sleep 5

    # Verify tunnel is running
    if pgrep -f "ssh.*-D.*${TUNNEL_PORT}" > /dev/null; then
      echo "[proxy] ✓ SSH tunnel started successfully"
    else
      echo "[proxy] ✗ SSH tunnel failed to start"
      return 0
    fi

    # ============================================================================
    # Generate proxychains config
    # Key: Uses NODE_IP (not localhost) so worker nodes can access the tunnel
    # localnet entries ensure internal traffic (Ray, NCCL) bypasses proxy
    # ============================================================================
    SLURM_JOB_ID=${SLURM_JOB_ID:-"local"}
    CFG_PATH=~/.proxychains/proxychains_${SLURM_JOB_ID}.conf
    mkdir -p ~/.proxychains

    # The here-document body and its terminator stay at column 0 on purpose:
    # the config is written verbatim and `PCEOF` must start its line.
    cat > "$CFG_PATH" <<PCEOF
strict_chain
quiet_mode
tcp_read_time_out 30000
tcp_connect_time_out 15000
localnet 127.0.0.0/255.0.0.0
localnet 127.0.0.1/255.255.255.255
localnet 10.0.0.0/255.0.0.0
localnet 172.16.0.0/255.240.0.0
localnet 192.168.0.0/255.255.0.0
localnet 169.254.0.0/255.255.0.0
[ProxyList]
socks5 ${NODE_IP} ${TUNNEL_PORT}
PCEOF

    echo "[proxy] ✓ Generated proxychains config at $CFG_PATH"
    echo "[proxy] - Internal traffic (10.x.x.x, 172.x.x.x, 169.254.x.x) → DIRECT"
    echo "[proxy] - External traffic (internet) → PROXY via tunnel"

    # ============================================================================
    # Export proxychains configuration based on mode
    # ============================================================================
    export PROXYCHAINS_CONF_FILE="$CFG_PATH"
    export PROXYCHAINS_SOCKS5_HOST="${NODE_IP}"
    export PROXYCHAINS_SOCKS5_PORT="${TUNNEL_PORT}"

    # NOTE: the mode-specific exports below are disabled, so this function
    # exports neither PROXYCHAINS_BINARY nor LD_PRELOAD.
    # if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
    # # Binary wrapper approach (Jupiter ARM GH200)
    # # Ray workers will use: proxychains4 -f $PROXYCHAINS_CONF_FILE ray start ...
    # export PROXYCHAINS_BINARY="$PROXYCHAINS_BIN"
    # echo "[proxy] ✓ PROXYCHAINS_BINARY=$PROXYCHAINS_BIN"
    # echo "[proxy] ✓ PROXYCHAINS_CONF_FILE=$CFG_PATH"
    # echo "[proxy] ✓ PROXYCHAINS_SOCKS5_HOST=${NODE_IP} (accessible from worker nodes)"
    # echo "[proxy] ✓ PROXYCHAINS_SOCKS5_PORT=${TUNNEL_PORT}"
    # else
    # # LD_PRELOAD approach (Jureca, Juwels)
    # # Ray workers inherit proxy via LD_PRELOAD environment variable
    # export LD_PRELOAD="$PROXYCHAINS_LIB"
    # echo "[proxy] ✓ LD_PRELOAD set to $PROXYCHAINS_LIB"
    # echo "[proxy] ✓ PROXYCHAINS_CONF_FILE=$CFG_PATH"
    # echo "[proxy] ✓ PROXYCHAINS_SOCKS5_HOST=${NODE_IP} (accessible from worker nodes)"
    # echo "[proxy] ✓ PROXYCHAINS_SOCKS5_PORT=${TUNNEL_PORT}"
    # fi

    # ============================================================================
    # Daytona/aiohttp timeout and retry settings
    # ============================================================================
    export DAYTONA_MAX_RETRIES=5
    export DAYTONA_RETRY_DELAY=30
    export DAYTONA_BACKOFF_FACTOR=2
    export DAYTONA_TIMEOUT=1800 # 30 minutes
    export AIOHTTP_CLIENT_TIMEOUT=900 # 15 minutes
    export AIOHTTP_CONNECTOR_TIMEOUT=900
    export AIOHTTP_SOCK_CONNECT_TIMEOUT=300
    export AIOHTTP_TOTAL_TIMEOUT=1800

    # Disable SSL verification (JSC certificate issues)
    # NOTE(review): this weakens TLS checks for everything started afterwards.
    export PYTHONHTTPSVERIFY=0
    unset SSL_CERT_FILE
    unset CURL_CA_BUNDLE
    unset REQUESTS_CA_BUNDLE
    unset SSL_CERT_DIR

    echo "[proxy] ✓ Daytona timeout settings configured"

    # Test proxy connectivity (informational only; a failure just prints a warning)
    echo "[proxy] Testing proxy connectivity..."
    if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
      if "$PROXYCHAINS_BIN" -f "$CFG_PATH" curl -s --connect-timeout 10 https://huggingface.co -o /dev/null; then
        echo "[proxy] ✓ Proxy connectivity test passed (huggingface.co reachable via wrapped binary)"
      else
        echo "[proxy] ⚠ Proxy connectivity test failed (may still work for Daytona)"
      fi
    else
      if curl -s --connect-timeout 10 https://huggingface.co -o /dev/null 2>/dev/null; then
        echo "[proxy] ✓ Proxy connectivity test passed (huggingface.co reachable via LD_PRELOAD)"
      else
        echo "[proxy] ⚠ Proxy connectivity test failed (may still work for Daytona)"
      fi
    fi

    # Test that tunnel is accessible from this node's IP (for worker node access)
    if nc -z "${NODE_IP}" "${TUNNEL_PORT}" 2>/dev/null; then
      echo "[proxy] ✓ Tunnel accessible at ${NODE_IP}:${TUNNEL_PORT} (workers can connect)"
    else
      echo "[proxy] ⚠ Tunnel not accessible at ${NODE_IP}:${TUNNEL_PORT} (workers may fail)"
    fi

    if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
      echo "[proxy] ✓ Proxy setup complete (using wrapped binary for Ray workers)"
    else
      echo "[proxy] ✓ Proxy setup complete (using LD_PRELOAD for Ray worker inheritance)"
    fi
  fi
}

setup_proxy_tunnel
# --- Run the SFT job via Python runner ---
echo "=== Universal SFT Training Runner ==="
echo "Config: /e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_sft_config.json"
echo "Working directory: $WORKDIR"
echo "Nodes: $NUM_NODES, GPUs/node: $NUM_GPUS_PER_NODE"
echo "======================================"

echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}"
# PROXY_CMD is intentionally left unquoted: an empty or unset value expands to
# no words at all, and a multi-word wrapper command splits into arguments.
# NOTE(review): nothing in this script assigns PROXY_CMD, so unless it arrives
# via the environment or a sourced file the runner starts without a wrapper.
# shellcheck disable=SC2086
srun --mpi=none --nodes=24 ${PROXY_CMD:-} bash -c 'python -m hpc.sft_launch_utils --config "/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_sft_config.json"'