初始化项目，由ModelHub XC社区提供模型

Model: DCAgent/g1_gptlong_top8_32b Source: Original Platform
2026-05-26 18:58:52 +08:00
commit b71d09829c
33 changed files with 156554 additions and 0 deletions
--- a/training_configs/sft.sbatch
+++ b/training_configs/sft.sbatch
@@ -0,0 +1,366 @@
+#!/bin/bash
+#SBATCH --time=12:00:00
+#SBATCH --nodes=24
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=288
+#SBATCH --output=/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/logs/%x_%j.out
+#SBATCH --job-name=g1_gptlong_top8_32b__Qwen3-8B
+#SBATCH --mail-type=END,TIME_LIMIT,FAIL
+#SBATCH --mail-user=
+#SBATCH -p booster
+#SBATCH --account reformo
+#SBATCH --gres=gpu:4
+#SBATCH --exclude=jpbo-031-[01-48]
+
+# ==============================================================================
+# Universal SFT Training SBATCH Template
+# ==============================================================================
+# This template replaces the cluster-specific *_train.sbatch scripts by delegating
+# all logic to the SFTJobRunner Python class.
+#
+# Usage: The launcher writes a JSON config file and substitutes its path into this template; for this job: /e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_sft_config.json
+# ==============================================================================
+
+source /e/scratch/jureap59/feuer1/miniforge3/etc/profile.d/conda.sh
+conda activate otagent
+set -eo pipefail  # turned on only after conda activation; nounset (-u) is not enabled
+ml purge
+ulimit -c 0  # Disable core dumps to avoid filling disk space
+
+# Handle bash completion scripts that use BASH_COMPLETION_DEBUG
+if [ -z "${BASH_COMPLETION_DEBUG+x}" ]; then
+  export BASH_COMPLETION_DEBUG=""
+fi
+
+# --- Clean up /tmp to prevent state pollution from previous jobs ---
+# Some HPC systems retain /tmp contents across job allocations on the same node.
+# This can cause issues with tmux sessions, container state, and other temporary files.
+rm -rf /tmp/tmux-* 2>/dev/null || true
+rm -rf /tmp/ray 2>/dev/null || true
+rm -rf /tmp/hf_home 2>/dev/null || true
+rm -rf /tmp/containers 2>/dev/null || true
+rm -rf /tmp/podman-* 2>/dev/null || true
+
+# Guard conda deactivate scripts from set -u complaints
+export CONDA_BACKUP_CXX="${CONDA_BACKUP_CXX:-}"
+export CONDA_BACKUP_CC="${CONDA_BACKUP_CC:-}"
+export CONDA_BACKUP_FC="${CONDA_BACKUP_FC:-}"
+
+# --- Module and Conda Setup ---
+
+# --- Module loading (cluster-specific, substituted by launcher) ---
+module load nvidia-compilers/25.9-CUDA-13
+
+# --- Environment setup: WORKDIR is DCFT_PRIVATE if non-empty, else DCFT if non-empty, else $PWD ---
+if [ -n "${DCFT_PRIVATE:-}" ]; then
+  WORKDIR="$DCFT_PRIVATE"
+elif [ -n "${DCFT:-}" ]; then
+  WORKDIR="$DCFT"
+else
+  WORKDIR="$PWD"
+fi
+cd "$WORKDIR"
+
+if [ -z "${DCFT:-}" ]; then  # later steps reference $DCFT, so default it to WORKDIR
+  export DCFT="$WORKDIR"
+fi
+
+# --- Conda activation (cluster-specific, substituted by launcher) ---
+# No launcher-substituted conda activation here; the "otagent" env is already activated at the top of this script
+
+# --- Source environment files (each step is skipped when its variable is empty or its file is missing) ---
+if [ -n "${DCFT:-}" ] && [ -f "$DCFT/hpc/dotenv/jupiter.env" ]; then
+  source "$DCFT/hpc/dotenv/jupiter.env"
+fi
+if [ -n "${DC_AGENT_SECRET_ENV:-}" ] && [ -f "$DC_AGENT_SECRET_ENV" ]; then
+  set -a  # auto-export every variable assigned while the secrets file is sourced
+  source "$DC_AGENT_SECRET_ENV"
+  set +a
+fi
+if [ -n "${DCFT_ACTIVATE_ENV:-}" ]; then
+  eval "$DCFT_ACTIVATE_ENV"  # NOTE(review): executes an environment-supplied string verbatim — make sure it is trusted
+fi
+
+# --- CUDA path detection (Perlmutter and similar) ---
+
+
+# --- NCCL/Networking settings (cluster-specific) ---
+# Cluster-specific NCCL/networking settings
+export NCCL_DEBUG="INFO"
+export NCCL_NET_GDR_LEVEL="0"
+export NCCL_SOCKET_IFNAME="ib0"
+export NCCL_IB_TIMEOUT="60"
+
+# --- Cluster-specific environment variables ---
+export WANDB_MODE="offline"
+export GLOO_USE_IPV6="0"
+export NCCL_SOCKET_FAMILY="AF_INET"
+export SKYRL_ENABLE_NUMA_AFFINITY="1"
+export DISABLE_AIOHTTP_TRANSPORT="True"
+export VLLM_ALLREDUCE_USE_SYMM_MEM="0"
+
+# --- Ray defaults ---
+# RAY_CGRAPH_get_timeout and RAY_TMPDIR keep any non-empty value already present in the environment.
+export RAY_CGRAPH_get_timeout="${RAY_CGRAPH_get_timeout:-900}"
+# GH200 unified memory: GPU HBM is part of system RAM, so Ray's
+# memory monitor double-counts GPU allocations and kills workers
+# during model loading.  Disable the monitor entirely.
+export RAY_memory_monitor_refresh_ms=0
+if [ -z "${RAY_TMPDIR:-}" ]; then
+  RAY_TMPDIR_BASE="/tmp/ray"
+  RAY_TMPDIR="${RAY_TMPDIR_BASE}/ray_${SLURM_JOB_ID:-$$}"  # suffix is this shell's PID when SLURM_JOB_ID is unset or empty
+  mkdir -p "$RAY_TMPDIR"
+fi
+export RAY_TMPDIR="${RAY_TMPDIR}"
+echo "[ray] RAY_TMPDIR=$RAY_TMPDIR"
+
+# --- Standard environment variables ---
+export PYTHONFAULTHANDLER=1
+export TORCH_SHOW_CPP_STACKTRACES=1
+export CUDA_LAUNCH_BLOCKING=0
+export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
+export OMP_NUM_THREADS=1
+export PYTHONPATH="$WORKDIR:${PYTHONPATH:-}"
+export DISABLE_VERSION_CHECK=1  # Skip LlamaFactory transformers version check
+
+# --- Distributed training setup ---
+export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)  # first hostname of the job's node list
+# If the hostname contains "jrc" or "jwb", set the master address to ${MASTER_ADDR}i to use InfiniBand
+if [[  "$MASTER_ADDR" == *"jrc"* || "$MASTER_ADDR" == *"jwb"* ]]; then
+    export MASTER_ADDR="${MASTER_ADDR}i"
+fi
+
+echo "MASTER_ADDR set to $MASTER_ADDR"
+export MASTER_PORT=12802
+export NUM_NODES=$SLURM_JOB_NUM_NODES
+export NUM_GPUS_PER_NODE=4
+export NUM_GPUS=$((NUM_GPUS_PER_NODE*SLURM_NNODES))  # NOTE(review): uses SLURM_NNODES while NUM_NODES uses SLURM_JOB_NUM_NODES — confirm both are always set
+
+# --- HuggingFace/WandB paths ---
+export HF_HOME="${HF_HOME:-${HF_HUB_CACHE:-/tmp/hf_home}}"  # precedence: HF_HOME, then HF_HUB_CACHE, then /tmp/hf_home
+export WANDB_DIR="${DCFT_WANDB_DIR:-$DCFT//e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/wandb}"
+
+# --- Triton/TorchInductor cache settings (node-local to avoid shared FS issues) ---
+export TRITON_CACHE_VERBOSE=1
+source "$WORKDIR/hpc/shell_utils/triton_cache.sh"
+# --- Create experiment directories ---
+# NOTE(review): "$DCFT//e/scratch/..." appends an absolute-looking path to $DCFT, so these directories (and the WANDB_DIR fallback above) resolve under $DCFT, not under /e/scratch — confirm this is intended.
+mkdir -p "$DCFT//e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b"
+mkdir -p "$DCFT//e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/logs"
+mkdir -p "$DCFT//e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/tmp"
+
+# --- Supabase environment variables for DB registration: export each one that is set and non-empty, warn on stderr otherwise ---
+for _supabase_var in SUPABASE_URL SUPABASE_ANON_KEY SUPABASE_SERVICE_ROLE_KEY; do
+    if [[ -n "${!_supabase_var:-}" ]]; then
+        export "${_supabase_var}=${!_supabase_var}"
+    else
+        echo "Warning: ${_supabase_var} is not set; Supabase registration may fail." >&2
+    fi
+done
+
+# --- SSH Tunneling (JSC clusters only) ---
+# ============================================================================
+# SSH Tunnel + Proxychains Setup for No-Internet Clusters (JSC)
+#
+# Creates SOCKS5 proxy via SSH tunnel to login node, then uses proxychains
+# to route external traffic through the tunnel.
+# Jupiter (ARM GH200): Uses wrapped binary approach (proxychains4 -f <config> cmd)
+# Other JSC clusters: Uses LD_PRELOAD approach for Ray worker inheritance
+# Runs inside a function so each "return 0" skips only this setup; a bare top-level return fails in an executed script and aborts under set -e.
+# ============================================================================
+
+setup_proxy_tunnel() {  # no "local": variables stay global, exactly as in the inline version
+    # Determine login node and proxychains paths based on cluster
+    NODE_HOST=$(hostname -s)
+    PROXYCHAINS_MODE=""  # "binary" or "ldpreload"
+
+    if [[ $NODE_HOST == jrc* ]]; then
+        LOGIN_NODE="jrlogin05i"
+        PROXYCHAINS_LIB="/p/scratch/synthlaion/dc-agent-shared/tools/proxychains-ng-install/lib/libproxychains4.so"
+        PROXYCHAINS_MODE="ldpreload"
+    elif [[ $NODE_HOST == jwb* ]]; then
+        LOGIN_NODE="jwlogin22i"
+        PROXYCHAINS_LIB="/p/scratch/synthlaion/dc-agent-shared/tools/proxychains-ng-install/lib/libproxychains4.so"
+        PROXYCHAINS_MODE="ldpreload"
+    elif [[ $NODE_HOST == jpb* ]] || [[ $NODE_HOST == jpc* ]]; then
+        LOGIN_NODE="jpbl-s01-01"
+        # Jupiter uses aarch64 build - binary wrapper approach (LD_PRELOAD doesn't work reliably)
+        PROXYCHAINS_BIN="/e/scratch/jureap59/feuer1/proxychains-ng-aarch64/bin/proxychains4"
+        PROXYCHAINS_MODE="binary"
+    elif [[ $NODE_HOST == lrdn* ]] || [[ $NODE_HOST == *.leonardo.local ]]; then
+        LOGIN_NODE="login05-ext.leonardo.cineca.it"
+        # Leonardo uses x86 build - binary wrapper approach
+        PROXYCHAINS_BIN="/leonardo/home/userexternal/bfeuer00/proxychains/bin/proxychains4"
+        PROXYCHAINS_MODE="binary"
+    else
+        echo "[proxy] Unknown cluster for node $NODE_HOST - skipping proxy setup"
+        return 0
+    fi
+
+    TUNNEL_PORT=7003
+    # Check if proxychains is available
+    if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
+        if [ ! -x "$PROXYCHAINS_BIN" ]; then
+            echo "[proxy] ✗ proxychains binary not found at $PROXYCHAINS_BIN"
+            echo "[proxy] Skipping proxy setup - external connectivity will fail"
+            return 0
+        fi
+        echo "[proxy] ✓ Found proxychains binary at $PROXYCHAINS_BIN"
+    else
+        if [ ! -f "$PROXYCHAINS_LIB" ]; then
+            echo "[proxy] ✗ proxychains library not found at $PROXYCHAINS_LIB"
+            echo "[proxy] Skipping proxy setup - external connectivity will fail"
+            return 0
+        fi
+        echo "[proxy] ✓ Found proxychains library at $PROXYCHAINS_LIB"
+    fi
+
+    if [ -z "${SSH_KEY:-}" ]; then
+        echo "[proxy] SSH_KEY not set - skipping proxy setup"
+        echo "[proxy] Set SSH_KEY in your environment to enable internet access"
+    else
+        # Get this node's IP address for multi-node proxy access
+        NODE_IP=$(nslookup "$NODE_HOST" | grep 'Address' | tail -n1 | awk '{print $2}')
+        echo "[proxy] Setting up SSH tunnel to $LOGIN_NODE"
+        echo "[proxy] SSH key: $SSH_KEY"
+        echo "[proxy] Tunnel port: $TUNNEL_PORT"
+        echo "[proxy] Node IP: $NODE_IP (workers will connect here)"
+
+        # Create SSH tunnel with SOCKS5 proxy
+        # -g flag allows remote hosts (worker nodes) to connect to the tunnel
+        ssh -g -f -N -D "${TUNNEL_PORT}" \
+            -o StrictHostKeyChecking=no \
+            -o ConnectTimeout=1000 \
+            -o ServerAliveInterval=10 \
+            -o ServerAliveCountMax=30 \
+            -o TCPKeepAlive=yes \
+            -o ExitOnForwardFailure=yes \
+            -o BatchMode=yes \
+            -i "${SSH_KEY}" \
+            "${USER}@${LOGIN_NODE}"
+
+        sleep 5  # Give tunnel time to establish
+
+        # Verify tunnel is running
+        if pgrep -f "ssh.*-D.*${TUNNEL_PORT}" > /dev/null; then
+            echo "[proxy] ✓ SSH tunnel started successfully"
+        else
+            echo "[proxy] ✗ SSH tunnel failed to start"
+            return 0
+        fi
+
+        # ============================================================================
+        # Generate proxychains config
+        # Key: Uses NODE_IP (not localhost) so worker nodes can access the tunnel
+        # localnet entries ensure internal traffic (Ray, NCCL) bypasses proxy
+        # ============================================================================
+        SLURM_JOB_ID=${SLURM_JOB_ID:-"local"}
+        CFG_PATH=~/.proxychains/proxychains_${SLURM_JOB_ID}.conf
+        mkdir -p ~/.proxychains
+
+        cat > "$CFG_PATH" <<PCEOF
+strict_chain
+quiet_mode
+tcp_read_time_out 30000
+tcp_connect_time_out 15000
+localnet 127.0.0.0/255.0.0.0
+localnet 127.0.0.1/255.255.255.255
+localnet 10.0.0.0/255.0.0.0
+localnet 172.16.0.0/255.240.0.0
+localnet 192.168.0.0/255.255.0.0
+localnet 169.254.0.0/255.255.0.0
+[ProxyList]
+socks5 ${NODE_IP} ${TUNNEL_PORT}
+PCEOF
+
+        echo "[proxy] ✓ Generated proxychains config at $CFG_PATH"
+        echo "[proxy]   - Internal traffic (10.x.x.x, 172.x.x.x, 169.254.x.x) → DIRECT"
+        echo "[proxy]   - External traffic (internet) → PROXY via tunnel"
+
+        # ============================================================================
+        # Export proxychains configuration based on mode
+        # ============================================================================
+        export PROXYCHAINS_CONF_FILE="$CFG_PATH"
+        export PROXYCHAINS_SOCKS5_HOST="${NODE_IP}"
+        export PROXYCHAINS_SOCKS5_PORT="${TUNNEL_PORT}"
+
+        # if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
+        #     # Binary wrapper approach (Jupiter ARM GH200)
+        #     # Ray workers will use: proxychains4 -f $PROXYCHAINS_CONF_FILE ray start ...
+        #     export PROXYCHAINS_BINARY="$PROXYCHAINS_BIN"
+        #     echo "[proxy] ✓ PROXYCHAINS_BINARY=$PROXYCHAINS_BIN"
+        #     echo "[proxy] ✓ PROXYCHAINS_CONF_FILE=$CFG_PATH"
+        #     echo "[proxy] ✓ PROXYCHAINS_SOCKS5_HOST=${NODE_IP} (accessible from worker nodes)"
+        #     echo "[proxy] ✓ PROXYCHAINS_SOCKS5_PORT=${TUNNEL_PORT}"
+        # else
+        #     # LD_PRELOAD approach (Jureca, Juwels)
+        #     # Ray workers inherit proxy via LD_PRELOAD environment variable
+        #     export LD_PRELOAD="$PROXYCHAINS_LIB"
+        #     echo "[proxy] ✓ LD_PRELOAD set to $PROXYCHAINS_LIB"
+        #     echo "[proxy] ✓ PROXYCHAINS_CONF_FILE=$CFG_PATH"
+        #     echo "[proxy] ✓ PROXYCHAINS_SOCKS5_HOST=${NODE_IP} (accessible from worker nodes)"
+        #     echo "[proxy] ✓ PROXYCHAINS_SOCKS5_PORT=${TUNNEL_PORT}"
+        # fi
+
+        # ============================================================================
+        # Daytona/aiohttp timeout and retry settings
+        # ============================================================================
+        export DAYTONA_MAX_RETRIES=5
+        export DAYTONA_RETRY_DELAY=30
+        export DAYTONA_BACKOFF_FACTOR=2
+        export DAYTONA_TIMEOUT=1800  # 30 minutes
+        export AIOHTTP_CLIENT_TIMEOUT=900  # 15 minutes
+        export AIOHTTP_CONNECTOR_TIMEOUT=900
+        export AIOHTTP_SOCK_CONNECT_TIMEOUT=300
+        export AIOHTTP_TOTAL_TIMEOUT=1800
+
+        # Disable SSL verification (JSC certificate issues)
+        export PYTHONHTTPSVERIFY=0
+        unset SSL_CERT_FILE
+        unset CURL_CA_BUNDLE
+        unset REQUESTS_CA_BUNDLE
+        unset SSL_CERT_DIR
+
+        echo "[proxy] ✓ Daytona timeout settings configured"
+
+        # Test proxy connectivity
+        echo "[proxy] Testing proxy connectivity..."
+        if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
+            if "$PROXYCHAINS_BIN" -f "$CFG_PATH" curl -s --connect-timeout 10 https://huggingface.co -o /dev/null; then
+                echo "[proxy] ✓ Proxy connectivity test passed (huggingface.co reachable via wrapped binary)"
+            else
+                echo "[proxy] ⚠ Proxy connectivity test failed (may still work for Daytona)"
+            fi
+        else
+            if curl -s --connect-timeout 10 https://huggingface.co -o /dev/null 2>/dev/null; then
+                echo "[proxy] ✓ Proxy connectivity test passed (huggingface.co reachable via LD_PRELOAD)"
+            else
+                echo "[proxy] ⚠ Proxy connectivity test failed (may still work for Daytona)"
+            fi
+        fi
+
+        # Test that tunnel is accessible from this node's IP (for worker node access)
+        if nc -z "${NODE_IP}" "${TUNNEL_PORT}" 2>/dev/null; then
+            echo "[proxy] ✓ Tunnel accessible at ${NODE_IP}:${TUNNEL_PORT} (workers can connect)"
+        else
+            echo "[proxy] ⚠ Tunnel not accessible at ${NODE_IP}:${TUNNEL_PORT} (workers may fail)"
+        fi
+
+        if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
+            echo "[proxy] ✓ Proxy setup complete (using wrapped binary for Ray workers)"
+        else
+            echo "[proxy] ✓ Proxy setup complete (using LD_PRELOAD for Ray worker inheritance)"
+        fi
+    fi
+}
+setup_proxy_tunnel
+
+# --- Run the SFT job via Python runner ---
+echo "=== Universal SFT Training Runner ==="
+echo "Config: /e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_sft_config.json"
+echo "Working directory: $WORKDIR"
+echo "Nodes: $NUM_NODES, GPUs/node: $NUM_GPUS_PER_NODE"
+echo "======================================"
+
+echo LD_LIBRARY_PATH=$LD_LIBRARY_PATH
+srun --mpi=none --nodes=24 $PROXY_CMD bash -c 'python -m hpc.sft_launch_utils --config "/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_sft_config.json"'
--- a/training_configs/sft_config.json
+++ b/training_configs/sft_config.json
@@ -0,0 +1,15 @@
+{
+  "job_name": "g1_gptlong_top8_32b__Qwen3-8B",
+  "train_config_path": "/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_train_config.yaml",
+  "experiments_dir": "/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b",
+  "cluster_name": "jupiter",
+  "num_nodes": 24,
+  "gpus_per_node": 4,
+  "cpus_per_node": 288,
+  "launcher": "accelerate",
+  "accelerate_config_path": null,
+  "deepspeed_config": "sft/lf_configs/deepspeed/ds_z3_accelerate.json",
+  "master_port": 12802,
+  "needs_ssh_tunnel": true,
+  "needs_cuda_detection": false
+}
--- a/training_configs/train_config.yaml
+++ b/training_configs/train_config.yaml
@@ -0,0 +1,52 @@
+adam_beta2: 0.98
+assistant_tag: assistant
+attn: fa2
+bf16: true
+content_tag: content
+cutoff_len: 32768
+dataloader_num_workers: 4
+dataloader_persistent_workers: true
+dataloader_pin_memory: true
+dataset: /e/scratch/jureap59/raoof1/sft_data/hf_hub/datasets--DCAgent--g1_min_episodes_e1_gpt_long_top8_glm47_traces/snapshots/9828cc7d5cb31c19ed7e6dead76bd24dc2d66262_thinking_preprocessed
+dataset_dir: ONLINE
+datasets_cache_dir: /e/scratch/jureap59/raoof1/sft_data/arrow_cache
+ddp_timeout: 180000000
+deepspeed: sft/lf_configs/deepspeed/ds_z3_accelerate.json
+do_train: true
+enable_liger_kernel: true
+finetuning_type: full
+formatting: sharegpt
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+hub_model_id: DCAgent/g1_gptlong_top8_32b
+include_mfu: true
+learning_rate: 4.0e-05
+load_best_model_at_end: false
+logging_steps: 5
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_grad_norm: 0.001
+messages: conversations
+model_name_or_path: /e/scratch/jureap59/raoof1/sft_data/hf_hub/models--Qwen--Qwen3-32B/snapshots/9216db5781bf21249d130ec9da846c4624c16137
+num_train_epochs: 5.0
+optim: adamw_torch_fused
+output_dir: /e/scratch/jureap59/raoof1/sft_data/checkpoints/sft_g1_gptlong_top8_32b__Qwen3-32B
+overwrite_cache: true
+per_device_train_batch_size: 1
+plot_loss: true
+preprocessing_num_workers: 16
+pure_bf16: false
+push_to_hub: false
+role_tag: role
+run_name: g1_gptlong_top8_32b__Qwen3-8B  # NOTE(review): name says Qwen3-8B but model_name_or_path points at a Qwen3-32B snapshot — confirm the label
+save_steps: 300
+save_strategy: steps
+save_total_limit: 1
+seed: 42
+stage: sft
+template: qwen3
+trust_remote_code: true
+user_tag: user
+warmup_ratio: 0.1
+weight_decay: 0.04
+disable_shuffling: true