#!/bin/bash
#SBATCH --time=12:00:00
#SBATCH --nodes=24
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=288
#SBATCH --output=/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/logs/%x_%j.out
#SBATCH --job-name=g1_gptlong_top8_32b__Qwen3-8B
#SBATCH --mail-type=END,TIME_LIMIT,FAIL
#SBATCH --mail-user=
#SBATCH -p booster
#SBATCH --account reformo
#SBATCH --gres=gpu:4
#SBATCH --exclude=jpbo-031-[01-48]

# ==============================================================================
# Universal SFT Training SBATCH Template
# ==============================================================================
# This template replaces the cluster-specific *_train.sbatch scripts by delegating
# all logic to the SFTJobRunner Python class.
#
# Usage: The launcher writes a JSON config file and substitutes /e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_sft_config.json
# ==============================================================================

# Conda is activated before strict mode is switched on below, so a non-zero
# status from these two commands does not abort the job.
source /e/scratch/jureap59/feuer1/miniforge3/etc/profile.d/conda.sh
conda activate otagent

# From here on: exit on the first failing command or failing pipeline stage.
# (-u is intentionally not enabled; later code expands possibly-unset variables.)
set -eo pipefail

ml purge
ulimit -c 0  # Disable core dumps to avoid filling disk space

# Handle bash completion scripts that use BASH_COMPLETION_DEBUG:
# define it as empty only when it is completely unset.
if [[ -z "${BASH_COMPLETION_DEBUG+x}" ]]; then
  export BASH_COMPLETION_DEBUG=""
fi

# --- Clean up /tmp to prevent state pollution from previous jobs ---
# Some HPC systems retain /tmp contents across job allocations on the same node.
# This can cause issues with tmux sessions, container state, and other temporary files.
# Remove stale node-local state left in /tmp; each removal is best-effort,
# so errors are silenced and never abort the job.
rm -rf /tmp/tmux-* 2>/dev/null || true
rm -rf /tmp/ray 2>/dev/null || true
rm -rf /tmp/hf_home 2>/dev/null || true
rm -rf /tmp/containers 2>/dev/null || true
rm -rf /tmp/podman-* 2>/dev/null || true

# Guard conda deactivate scripts from set -u complaints
export CONDA_BACKUP_CXX="${CONDA_BACKUP_CXX:-}"
export CONDA_BACKUP_CC="${CONDA_BACKUP_CC:-}"
export CONDA_BACKUP_FC="${CONDA_BACKUP_FC:-}"

# --- Module and Conda Setup ---

# --- Module loading (cluster-specific, substituted by launcher) ---
module load nvidia-compilers/25.9-CUDA-13

# --- Environment setup ---
# WORKDIR precedence: DCFT_PRIVATE, then DCFT, then the submission directory.
if [[ -n "${DCFT_PRIVATE:-}" ]]; then
  WORKDIR="$DCFT_PRIVATE"
elif [[ -n "${DCFT:-}" ]]; then
  WORKDIR="$DCFT"
else
  WORKDIR="$PWD"
fi

cd "$WORKDIR"

# Later sections build paths from $DCFT, so make sure it is always defined.
if [[ -z "${DCFT:-}" ]]; then
  export DCFT="$WORKDIR"
fi

# --- Conda activation (cluster-specific, substituted by launcher) ---
# No conda activation configured

# --- Source environment files ---
if [[ -n "${DCFT:-}" && -f "$DCFT/hpc/dotenv/jupiter.env" ]]; then
  source "$DCFT/hpc/dotenv/jupiter.env"
fi

# Secrets file: `set -a` auto-exports every variable it assigns.
if [[ -n "${DC_AGENT_SECRET_ENV:-}" && -f "$DC_AGENT_SECRET_ENV" ]]; then
  set -a
  source "$DC_AGENT_SECRET_ENV"
  set +a
fi

# NOTE(review): this evals an arbitrary string taken from the environment;
# it must only ever be set by trusted launcher configuration.
if [[ -n "${DCFT_ACTIVATE_ENV:-}" ]]; then
  eval "$DCFT_ACTIVATE_ENV"
fi

# --- CUDA path detection (Perlmutter and similar) ---

# --- NCCL/Networking settings (cluster-specific) ---
# Cluster-specific NCCL/networking settings
export NCCL_DEBUG="INFO"
export NCCL_NET_GDR_LEVEL="0"
export NCCL_SOCKET_IFNAME="ib0"
export NCCL_IB_TIMEOUT="60"

# --- Cluster-specific environment variables ---
export WANDB_MODE="offline"
export GLOO_USE_IPV6="0"
export NCCL_SOCKET_FAMILY="AF_INET"
export SKYRL_ENABLE_NUMA_AFFINITY="1"
export DISABLE_AIOHTTP_TRANSPORT="True"
export VLLM_ALLREDUCE_USE_SYMM_MEM="0"

# --- Ray defaults ---
export RAY_CGRAPH_get_timeout="${RAY_CGRAPH_get_timeout:-900}"

# GH200 unified memory: GPU HBM is part of system RAM, so Ray's
# memory monitor double-counts GPU allocations and kills workers
# during model loading.
# Disable Ray's memory monitor entirely.
export RAY_memory_monitor_refresh_ms=0

# Use a per-job Ray temp dir unless the caller already provided one.
if [[ -z "${RAY_TMPDIR:-}" ]]; then
  RAY_TMPDIR_BASE="/tmp/ray"
  RAY_TMPDIR="${RAY_TMPDIR_BASE}/ray_${SLURM_JOB_ID:-$$}"
  mkdir -p "$RAY_TMPDIR"
fi
export RAY_TMPDIR="${RAY_TMPDIR}"
echo "[ray] RAY_TMPDIR=$RAY_TMPDIR"

# --- Standard environment variables ---
export PYTHONFAULTHANDLER=1
export TORCH_SHOW_CPP_STACKTRACES=1
export CUDA_LAUNCH_BLOCKING=0
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
export OMP_NUM_THREADS=1
export PYTHONPATH="$WORKDIR:${PYTHONPATH:-}"
export DISABLE_VERSION_CHECK=1  # Skip LlamaFactory transformers version check

# --- Distributed training setup ---
# Assign first, export second: `export VAR=$(cmd)` would hide a failing cmd.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) || {
  echo "Failed to resolve MASTER_ADDR from SLURM_JOB_NODELIST" >&2
  exit 1
}
export MASTER_ADDR
# If the hostname contains jrc or jwb, set the master address to ${MASTER_ADDR}i
# to use InfiniBand.
if [[ "$MASTER_ADDR" == *"jrc"* || "$MASTER_ADDR" == *"jwb"* ]]; then
  export MASTER_ADDR="${MASTER_ADDR}i"
fi
echo "MASTER_ADDR set to $MASTER_ADDR"
export MASTER_PORT=12802
export NUM_NODES=$SLURM_JOB_NUM_NODES
export NUM_GPUS_PER_NODE=4
export NUM_GPUS=$((NUM_GPUS_PER_NODE*SLURM_NNODES))

# --- HuggingFace/WandB paths ---
export HF_HOME="${HF_HOME:-${HF_HUB_CACHE:-/tmp/hf_home}}"
# NOTE(review): "$DCFT//e/scratch/..." appends an absolute path to $DCFT, so this
# (and the mkdir calls below) resolve to a tree nested under $DCFT rather than
# /e/scratch/... itself. Looks like a templating artifact - confirm the intended
# location before changing; kept byte-identical here.
export WANDB_DIR="${DCFT_WANDB_DIR:-$DCFT//e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/wandb}"

# --- Triton/TorchInductor cache settings (node-local to avoid shared FS issues) ---
export TRITON_CACHE_VERBOSE=1
source "$WORKDIR/hpc/shell_utils/triton_cache.sh"

# --- Create experiment directories ---
mkdir -p "$DCFT//e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b"
mkdir -p "$DCFT//e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/logs"
mkdir -p "$DCFT//e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/tmp"

# --- Supabase environment variables for DB registration ---
# Re-export each variable that is set; warn (on stderr) about each one that is not.
for _supabase_var in SUPABASE_URL SUPABASE_ANON_KEY SUPABASE_SERVICE_ROLE_KEY; do
  if [[ -n "${!_supabase_var:-}" ]]; then
    export "${_supabase_var}=${!_supabase_var}"
  else
    echo "Warning: ${_supabase_var} is not set; Supabase registration may fail." >&2
  fi
done

# --- SSH Tunneling (JSC clusters only) ---
# ============================================================================
# SSH Tunnel + Proxychains Setup for No-Internet Clusters (JSC)
#
# Creates SOCKS5 proxy via SSH tunnel to login node, then uses proxychains
# to route external traffic through the tunnel.
#
# Jupiter (ARM GH200): Uses wrapped binary approach (proxychains4 -f cmd)
# Other JSC clusters: Uses LD_PRELOAD approach for Ray worker inheritance
# ============================================================================

#######################################
# Best-effort proxy setup. Every "skip" path returns 0 so the training job
# continues without external connectivity.
#
# This logic lives in a function because it uses `return 0` on its skip
# paths: at the top level of an executed (non-sourced) script `return` is an
# error, which under `set -e` would abort the job instead of skipping.
#
# Globals (deliberately not `local`, matching the original top-level code):
#   reads:  SSH_KEY, USER, SLURM_JOB_ID
#   writes: NODE_HOST, PROXYCHAINS_MODE, LOGIN_NODE, PROXYCHAINS_LIB,
#           PROXYCHAINS_BIN, TUNNEL_PORT, NODE_IP, SLURM_JOB_ID, CFG_PATH,
#           and (in the reconstructed section) PROXY_CMD,
#           PROXYCHAINS_CONF_FILE, LD_PRELOAD
# Returns: 0 on every skip path and on completion.
#######################################
setup_proxy_tunnel() {
  # Determine login node and proxychains paths based on cluster
  NODE_HOST=$(hostname -s)
  PROXYCHAINS_MODE=""  # "binary" or "ldpreload"

  if [[ $NODE_HOST == jrc* ]]; then
    LOGIN_NODE="jrlogin05i"
    PROXYCHAINS_LIB="/p/scratch/synthlaion/dc-agent-shared/tools/proxychains-ng-install/lib/libproxychains4.so"
    PROXYCHAINS_MODE="ldpreload"
  elif [[ $NODE_HOST == jwb* ]]; then
    LOGIN_NODE="jwlogin22i"
    PROXYCHAINS_LIB="/p/scratch/synthlaion/dc-agent-shared/tools/proxychains-ng-install/lib/libproxychains4.so"
    PROXYCHAINS_MODE="ldpreload"
  elif [[ $NODE_HOST == jpb* ]] || [[ $NODE_HOST == jpc* ]]; then
    LOGIN_NODE="jpbl-s01-01"
    # Jupiter uses aarch64 build - binary wrapper approach (LD_PRELOAD doesn't work reliably)
    PROXYCHAINS_BIN="/e/scratch/jureap59/feuer1/proxychains-ng-aarch64/bin/proxychains4"
    PROXYCHAINS_MODE="binary"
  elif [[ $NODE_HOST == lrdn* ]] || [[ $NODE_HOST == *.leonardo.local ]]; then
    LOGIN_NODE="login05-ext.leonardo.cineca.it"
    # Leonardo uses x86 build - binary wrapper approach
    PROXYCHAINS_BIN="/leonardo/home/userexternal/bfeuer00/proxychains/bin/proxychains4"
    PROXYCHAINS_MODE="binary"
  else
    echo "[proxy] Unknown cluster for node $NODE_HOST - skipping proxy setup"
    return 0
  fi

  TUNNEL_PORT=7003

  # Check if proxychains is available
  if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
    if [[ ! -x "$PROXYCHAINS_BIN" ]]; then
      echo "[proxy] ✗ proxychains binary not found at $PROXYCHAINS_BIN"
      echo "[proxy] Skipping proxy setup - external connectivity will fail"
      return 0
    fi
    echo "[proxy] ✓ Found proxychains binary at $PROXYCHAINS_BIN"
  else
    if [[ ! -f "$PROXYCHAINS_LIB" ]]; then
      echo "[proxy] ✗ proxychains library not found at $PROXYCHAINS_LIB"
      echo "[proxy] Skipping proxy setup - external connectivity will fail"
      return 0
    fi
    echo "[proxy] ✓ Found proxychains library at $PROXYCHAINS_LIB"
  fi

  if [[ -z "${SSH_KEY:-}" ]]; then
    echo "[proxy] SSH_KEY not set - skipping proxy setup"
    echo "[proxy] Set SSH_KEY in your environment to enable internet access"
    return 0
  fi

  # Get this node's IP address for multi-node proxy access
  NODE_IP=$(nslookup "$NODE_HOST" | grep 'Address' | tail -n1 | awk '{print $2}')

  echo "[proxy] Setting up SSH tunnel to $LOGIN_NODE"
  echo "[proxy] SSH key: $SSH_KEY"
  echo "[proxy] Tunnel port: $TUNNEL_PORT"
  echo "[proxy] Node IP: $NODE_IP (workers will connect here)"

  # Create SSH tunnel with SOCKS5 proxy
  # -g flag allows remote hosts (worker nodes) to connect to the tunnel
  # A failing ssh must not trip `set -e`: the pgrep check below decides
  # whether the tunnel is up and takes the skip path if it is not.
  ssh -g -f -N -D "${TUNNEL_PORT}" \
    -o StrictHostKeyChecking=no \
    -o ConnectTimeout=1000 \
    -o ServerAliveInterval=10 \
    -o ServerAliveCountMax=30 \
    -o TCPKeepAlive=yes \
    -o ExitOnForwardFailure=yes \
    -o BatchMode=yes \
    -i "${SSH_KEY}" \
    "${USER}@${LOGIN_NODE}" \
    || echo "[proxy] ssh exited with status $?" >&2

  # Give tunnel time to establish
  sleep 5

  # Verify tunnel is running
  if pgrep -f "ssh.*-D.*${TUNNEL_PORT}" > /dev/null; then
    echo "[proxy] ✓ SSH tunnel started successfully"
  else
    echo "[proxy] ✗ SSH tunnel failed to start"
    return 0
  fi

  # ============================================================================
  # Generate proxychains config
  # Key: Uses NODE_IP (not localhost) so worker nodes can access the tunnel
  # localnet entries ensure internal traffic (Ray, NCCL) bypasses proxy
  # ============================================================================
  SLURM_JOB_ID=${SLURM_JOB_ID:-"local"}
  CFG_PATH=~/.proxychains/proxychains_${SLURM_JOB_ID}.conf
  mkdir -p ~/.proxychains

  # NOTE(review): RECONSTRUCTED SECTION. The source text between the heredoc
  # opener and the LD_PRELOAD connectivity check was lost. Everything from
  # here down to the "via LD_PRELOAD" message (config body, PROXY_CMD /
  # LD_PRELOAD wiring, the binary-mode test and its pass message, curl flags)
  # is a best-effort rebuild from the surrounding comments and log strings.
  # Diff against the original template before relying on it.
  cat > "$CFG_PATH" <<EOF
strict_chain
proxy_dns
tcp_read_time_out 30000
tcp_connect_time_out 15000
localnet 127.0.0.0/255.0.0.0
localnet 10.0.0.0/255.0.0.0
localnet 172.16.0.0/255.240.0.0
localnet 192.168.0.0/255.255.0.0
[ProxyList]
socks5 ${NODE_IP} ${TUNNEL_PORT}
EOF

  if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
    # Wrapped-binary mode: the launch command is prefixed with proxychains4 -f <cfg>.
    export PROXY_CMD="$PROXYCHAINS_BIN -q -f $CFG_PATH"
    if "$PROXYCHAINS_BIN" -q -f "$CFG_PATH" \
        curl -s --connect-timeout 10 -o /dev/null https://huggingface.co 2>/dev/null; then
      echo "[proxy] ✓ Proxy connectivity test passed (huggingface.co reachable via proxychains binary)"
    else
      echo "[proxy] ⚠ Proxy connectivity test failed (may still work for Daytona)"
    fi
  else
    # LD_PRELOAD mode: child processes inherit the preload through the environment.
    export PROXYCHAINS_CONF_FILE="$CFG_PATH"
    export LD_PRELOAD="${PROXYCHAINS_LIB}${LD_PRELOAD:+:${LD_PRELOAD}}"
    if curl -s --connect-timeout 10 -o /dev/null https://huggingface.co 2>/dev/null; then
      echo "[proxy] ✓ Proxy connectivity test passed (huggingface.co reachable via LD_PRELOAD)"
    else
      echo "[proxy] ⚠ Proxy connectivity test failed (may still work for Daytona)"
    fi
  fi
  # (end of reconstructed section)

  # Test that tunnel is accessible from this node's IP (for worker node access)
  if nc -z "${NODE_IP}" "${TUNNEL_PORT}" 2>/dev/null; then
    echo "[proxy] ✓ Tunnel accessible at ${NODE_IP}:${TUNNEL_PORT} (workers can connect)"
  else
    echo "[proxy] ⚠ Tunnel not accessible at ${NODE_IP}:${TUNNEL_PORT} (workers may fail)"
  fi

  if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
    echo "[proxy] ✓ Proxy setup complete (using wrapped binary for Ray workers)"
  else
    echo "[proxy] ✓ Proxy setup complete (using LD_PRELOAD for Ray worker inheritance)"
  fi
}

setup_proxy_tunnel

# --- Run the SFT job via Python runner ---
echo "=== Universal SFT Training Runner ==="
echo "Config: /e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_sft_config.json"
echo "Working directory: $WORKDIR"
echo "Nodes: $NUM_NODES, GPUs/node: $NUM_GPUS_PER_NODE"
echo "======================================"
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}"

# PROXY_CMD is empty when proxy setup was skipped. It is intentionally left
# unquoted so that "<binary> -q -f <cfg>" splits into separate srun arguments.
# shellcheck disable=SC2086
srun --mpi=none --nodes=24 ${PROXY_CMD:-} bash -c 'python -m hpc.sft_launch_utils --config "/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_sft_config.json"'