初始化项目,由ModelHub XC社区提供模型
Model: DCAgent/g1_gptlong_top8_32b Source: Original Platform
This commit is contained in:
366
training_configs/sft.sbatch
Normal file
366
training_configs/sft.sbatch
Normal file
@@ -0,0 +1,366 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --time=12:00:00
|
||||
#SBATCH --nodes=24
|
||||
#SBATCH --ntasks-per-node=1
|
||||
#SBATCH --cpus-per-task=288
|
||||
#SBATCH --output=/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/logs/%x_%j.out
|
||||
#SBATCH --job-name=g1_gptlong_top8_32b__Qwen3-8B
|
||||
#SBATCH --mail-type=END,TIME_LIMIT,FAIL
|
||||
#SBATCH --mail-user=
|
||||
#SBATCH -p booster
|
||||
#SBATCH --account reformo
|
||||
#SBATCH --gres=gpu:4
|
||||
#SBATCH --exclude=jpbo-031-[01-48]
|
||||
|
||||
# ==============================================================================
|
||||
# Universal SFT Training SBATCH Template
|
||||
# ==============================================================================
|
||||
# This template replaces the cluster-specific *_train.sbatch scripts by delegating
|
||||
# all logic to the SFTJobRunner Python class.
|
||||
#
|
||||
# Usage: The launcher writes a JSON config file and substitutes its path into this template (for this job: /e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_sft_config.json)
|
||||
# ==============================================================================
|
||||
|
||||
source /e/scratch/jureap59/feuer1/miniforge3/etc/profile.d/conda.sh
|
||||
conda activate otagent
|
||||
set -eo pipefail
|
||||
ml purge
|
||||
ulimit -c 0 # Disable core dumps to avoid filling disk space
|
||||
|
||||
# Handle bash completion scripts that use BASH_COMPLETION_DEBUG
|
||||
if [ -z "${BASH_COMPLETION_DEBUG+x}" ]; then
|
||||
export BASH_COMPLETION_DEBUG=""
|
||||
fi
|
||||
|
||||
# --- Clean up /tmp to prevent state pollution from previous jobs ---
|
||||
# Some HPC systems retain /tmp contents across job allocations on the same node.
|
||||
# This can cause issues with tmux sessions, container state, and other temporary files.
|
||||
rm -rf /tmp/tmux-* 2>/dev/null || true
|
||||
rm -rf /tmp/ray 2>/dev/null || true
|
||||
rm -rf /tmp/hf_home 2>/dev/null || true
|
||||
rm -rf /tmp/containers 2>/dev/null || true
|
||||
rm -rf /tmp/podman-* 2>/dev/null || true
|
||||
|
||||
# Guard conda deactivate scripts from set -u complaints
|
||||
export CONDA_BACKUP_CXX="${CONDA_BACKUP_CXX:-}"
|
||||
export CONDA_BACKUP_CC="${CONDA_BACKUP_CC:-}"
|
||||
export CONDA_BACKUP_FC="${CONDA_BACKUP_FC:-}"
|
||||
|
||||
# --- Module and Conda Setup ---
|
||||
|
||||
# --- Module loading (cluster-specific, substituted by launcher) ---
|
||||
module load nvidia-compilers/25.9-CUDA-13
|
||||
|
||||
# --- Environment setup ---
|
||||
# Resolve the working directory. Preference order: DCFT_PRIVATE, then DCFT,
# then the directory the job was started from. Empty values count as unset.
WORKDIR="${DCFT_PRIVATE:-${DCFT:-$PWD}}"
cd "$WORKDIR"

# Publish DCFT for child processes when the caller did not provide one.
[ -n "${DCFT:-}" ] || export DCFT="$WORKDIR"
|
||||
|
||||
# --- Conda activation (cluster-specific, substituted by launcher) ---
|
||||
# No conda activation configured
|
||||
|
||||
# --- Source environment files ---
|
||||
if [ -n "${DCFT:-}" ] && [ -f "$DCFT/hpc/dotenv/jupiter.env" ]; then
|
||||
source "$DCFT/hpc/dotenv/jupiter.env"
|
||||
fi
|
||||
if [ -n "${DC_AGENT_SECRET_ENV:-}" ] && [ -f "$DC_AGENT_SECRET_ENV" ]; then
|
||||
set -a
|
||||
source "$DC_AGENT_SECRET_ENV"
|
||||
set +a
|
||||
fi
|
||||
if [ -n "${DCFT_ACTIVATE_ENV:-}" ]; then
|
||||
eval "$DCFT_ACTIVATE_ENV"
|
||||
fi
|
||||
|
||||
# --- CUDA path detection (Perlmutter and similar) ---
|
||||
|
||||
|
||||
# --- NCCL/Networking settings (cluster-specific) ---
|
||||
# Cluster-specific NCCL/networking settings
|
||||
export NCCL_DEBUG="INFO"
|
||||
export NCCL_NET_GDR_LEVEL="0"
|
||||
export NCCL_SOCKET_IFNAME="ib0"
|
||||
export NCCL_IB_TIMEOUT="60"
|
||||
|
||||
# --- Cluster-specific environment variables ---
|
||||
export WANDB_MODE="offline"
|
||||
export GLOO_USE_IPV6="0"
|
||||
export NCCL_SOCKET_FAMILY="AF_INET"
|
||||
export SKYRL_ENABLE_NUMA_AFFINITY="1"
|
||||
export DISABLE_AIOHTTP_TRANSPORT="True"
|
||||
export VLLM_ALLREDUCE_USE_SYMM_MEM="0"
|
||||
|
||||
# --- Ray defaults ---
|
||||
# --- Ray defaults ---
|
||||
export RAY_CGRAPH_get_timeout="${RAY_CGRAPH_get_timeout:-900}"
|
||||
# GH200 unified memory: GPU HBM is part of system RAM, so Ray's
|
||||
# memory monitor double-counts GPU allocations and kills workers
|
||||
# during model loading. Disable the monitor entirely.
|
||||
export RAY_memory_monitor_refresh_ms=0
|
||||
if [ -z "${RAY_TMPDIR:-}" ]; then
|
||||
RAY_TMPDIR_BASE="/tmp/ray"
|
||||
RAY_TMPDIR="${RAY_TMPDIR_BASE}/ray_${SLURM_JOB_ID:-$$}"
|
||||
mkdir -p "$RAY_TMPDIR"
|
||||
fi
|
||||
export RAY_TMPDIR="${RAY_TMPDIR}"
|
||||
echo "[ray] RAY_TMPDIR=$RAY_TMPDIR"
|
||||
|
||||
# --- Standard environment variables ---
|
||||
export PYTHONFAULTHANDLER=1
|
||||
export TORCH_SHOW_CPP_STACKTRACES=1
|
||||
export CUDA_LAUNCH_BLOCKING=0
|
||||
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
|
||||
export OMP_NUM_THREADS=1
|
||||
export PYTHONPATH="$WORKDIR:${PYTHONPATH:-}"
|
||||
export DISABLE_VERSION_CHECK=1 # Skip LlamaFactory transformers version check
|
||||
|
||||
# --- Distributed training setup ---
|
||||
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
|
||||
# If the hostname contains jrc or jwb, set the master address to ${MASTER_ADDR}i to use InfiniBand
|
||||
if [[ "$MASTER_ADDR" == *"jrc"* || "$MASTER_ADDR" == *"jwb"* ]]; then
|
||||
export MASTER_ADDR="${MASTER_ADDR}i"
|
||||
fi
|
||||
|
||||
echo "MASTER_ADDR set to $MASTER_ADDR"
|
||||
export MASTER_PORT=12802
|
||||
export NUM_NODES=$SLURM_JOB_NUM_NODES
|
||||
export NUM_GPUS_PER_NODE=4
|
||||
export NUM_GPUS=$((NUM_GPUS_PER_NODE*SLURM_NNODES))
|
||||
|
||||
# --- HuggingFace/WandB paths ---
|
||||
export HF_HOME="${HF_HOME:-${HF_HUB_CACHE:-/tmp/hf_home}}"
|
||||
# Default W&B directory lives inside the experiment directory. That directory
# is already an absolute path, so it must not be prefixed with $DCFT (the old
# default expanded to "$DCFT//e/scratch/...", a different tree from the one
# used for this job's logs and configs). DCFT_WANDB_DIR still overrides it.
export WANDB_DIR="${DCFT_WANDB_DIR:-/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/wandb}"
|
||||
|
||||
# --- Triton/TorchInductor cache settings (node-local to avoid shared FS issues) ---
|
||||
export TRITON_CACHE_VERBOSE=1
|
||||
source "$WORKDIR/hpc/shell_utils/triton_cache.sh"
|
||||
|
||||
# --- Create experiment directories ---
|
||||
# The experiment directory is an absolute path (the same one used by
# "#SBATCH --output" and the config file), so create it as-is. Prefixing it
# with $DCFT created the directories under "$DCFT//e/scratch/..." instead.
mkdir -p "/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b"
mkdir -p "/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/logs"
mkdir -p "/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/tmp"
|
||||
|
||||
# --- Supabase environment variables for DB registration ---
|
||||
# Re-export each Supabase credential that is present in the environment so
# child processes inherit it; warn on stderr about any that are missing.
for _supabase_var in SUPABASE_URL SUPABASE_ANON_KEY SUPABASE_SERVICE_ROLE_KEY; do
  if [[ -z "${!_supabase_var:-}" ]]; then
    echo "Warning: ${_supabase_var} is not set; Supabase registration may fail." >&2
    continue
  fi
  export "${_supabase_var}=${!_supabase_var}"
done
|
||||
|
||||
# --- SSH Tunneling (JSC clusters only) ---
|
||||
# ============================================================================
|
||||
# SSH Tunnel + Proxychains Setup for No-Internet Clusters (JSC)
|
||||
#
|
||||
# Creates SOCKS5 proxy via SSH tunnel to login node, then uses proxychains
|
||||
# to route external traffic through the tunnel.
|
||||
#
|
||||
# Jupiter (ARM GH200): Uses wrapped binary approach (proxychains4 -f <config> cmd)
|
||||
# Other JSC clusters: Uses LD_PRELOAD approach for Ray worker inheritance
|
||||
# ============================================================================
|
||||
|
||||
# Determine login node and proxychains paths based on cluster
|
||||
#######################################
# Best-effort SSH SOCKS5 tunnel + proxychains configuration.
#
# The logic lives in a function so that the "skip proxy setup" paths can use
# `return`. At the top level of an executed (non-sourced) script `return`
# is an error, and with `set -e` active that would abort the whole job
# instead of merely skipping the proxy.
#
# Variables are deliberately NOT declared `local`: they stay global, exactly
# as when this code ran at top level, so later parts of the script and child
# processes still see them.
#
# Globals read:    SSH_KEY, USER, SLURM_JOB_ID
# Globals written: NODE_HOST, NODE_IP, LOGIN_NODE, TUNNEL_PORT, CFG_PATH,
#                  PROXYCHAINS_MODE, PROXYCHAINS_BIN / PROXYCHAINS_LIB,
#                  SLURM_JOB_ID (defaulted to "local" when unset), plus the
#                  exported PROXYCHAINS_*, DAYTONA_*, AIOHTTP_* and
#                  PYTHONHTTPSVERIFY variables
# Returns:         0 on every skip path and on completion
#######################################
setup_proxy_tunnel() {
  # Determine login node and proxychains paths based on cluster
  NODE_HOST=$(hostname -s)
  PROXYCHAINS_MODE="" # "binary" or "ldpreload"

  if [[ $NODE_HOST == jrc* ]]; then
    LOGIN_NODE="jrlogin05i"
    PROXYCHAINS_LIB="/p/scratch/synthlaion/dc-agent-shared/tools/proxychains-ng-install/lib/libproxychains4.so"
    PROXYCHAINS_MODE="ldpreload"
  elif [[ $NODE_HOST == jwb* ]]; then
    LOGIN_NODE="jwlogin22i"
    PROXYCHAINS_LIB="/p/scratch/synthlaion/dc-agent-shared/tools/proxychains-ng-install/lib/libproxychains4.so"
    PROXYCHAINS_MODE="ldpreload"
  elif [[ $NODE_HOST == jpb* ]] || [[ $NODE_HOST == jpc* ]]; then
    LOGIN_NODE="jpbl-s01-01"
    # Jupiter uses aarch64 build - binary wrapper approach (LD_PRELOAD doesn't work reliably)
    PROXYCHAINS_BIN="/e/scratch/jureap59/feuer1/proxychains-ng-aarch64/bin/proxychains4"
    PROXYCHAINS_MODE="binary"
  elif [[ $NODE_HOST == lrdn* ]] || [[ $NODE_HOST == *.leonardo.local ]]; then
    LOGIN_NODE="login05-ext.leonardo.cineca.it"
    # Leonardo uses x86 build - binary wrapper approach
    PROXYCHAINS_BIN="/leonardo/home/userexternal/bfeuer00/proxychains/bin/proxychains4"
    PROXYCHAINS_MODE="binary"
  else
    echo "[proxy] Unknown cluster for node $NODE_HOST - skipping proxy setup"
    return 0
  fi

  TUNNEL_PORT=7003

  # Check if proxychains is available
  if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
    if [ ! -x "$PROXYCHAINS_BIN" ]; then
      echo "[proxy] ✗ proxychains binary not found at $PROXYCHAINS_BIN"
      echo "[proxy] Skipping proxy setup - external connectivity will fail"
      return 0
    fi
    echo "[proxy] ✓ Found proxychains binary at $PROXYCHAINS_BIN"
  else
    if [ ! -f "$PROXYCHAINS_LIB" ]; then
      echo "[proxy] ✗ proxychains library not found at $PROXYCHAINS_LIB"
      echo "[proxy] Skipping proxy setup - external connectivity will fail"
      return 0
    fi
    echo "[proxy] ✓ Found proxychains library at $PROXYCHAINS_LIB"
  fi

  if [ -z "${SSH_KEY:-}" ]; then
    echo "[proxy] SSH_KEY not set - skipping proxy setup"
    echo "[proxy] Set SSH_KEY in your environment to enable internet access"
  else
    # Get this node's IP address for multi-node proxy access
    NODE_IP=$(nslookup "$NODE_HOST" | grep 'Address' | tail -n1 | awk '{print $2}')
    echo "[proxy] Setting up SSH tunnel to $LOGIN_NODE"
    echo "[proxy] SSH key: $SSH_KEY"
    echo "[proxy] Tunnel port: $TUNNEL_PORT"
    echo "[proxy] Node IP: $NODE_IP (workers will connect here)"

    # Create SSH tunnel with SOCKS5 proxy
    # -g flag allows remote hosts (worker nodes) to connect to the tunnel
    ssh -g -f -N -D "${TUNNEL_PORT}" \
      -o StrictHostKeyChecking=no \
      -o ConnectTimeout=1000 \
      -o ServerAliveInterval=10 \
      -o ServerAliveCountMax=30 \
      -o TCPKeepAlive=yes \
      -o ExitOnForwardFailure=yes \
      -o BatchMode=yes \
      -i "${SSH_KEY}" \
      "${USER}@${LOGIN_NODE}"

    # Give tunnel time to establish
    sleep 5

    # Verify tunnel is running
    if pgrep -f "ssh.*-D.*${TUNNEL_PORT}" > /dev/null; then
      echo "[proxy] ✓ SSH tunnel started successfully"
    else
      echo "[proxy] ✗ SSH tunnel failed to start"
      return 0
    fi

    # ============================================================================
    # Generate proxychains config
    # Key: Uses NODE_IP (not localhost) so worker nodes can access the tunnel
    # localnet entries ensure internal traffic (Ray, NCCL) bypasses proxy
    # ============================================================================
    SLURM_JOB_ID=${SLURM_JOB_ID:-"local"}
    CFG_PATH=~/.proxychains/proxychains_${SLURM_JOB_ID}.conf
    mkdir -p ~/.proxychains

    # Heredoc body and terminator stay at column 0: `<<` (without `-`) does
    # not strip leading whitespace.
    cat > "$CFG_PATH" <<PCEOF
strict_chain
quiet_mode
tcp_read_time_out 30000
tcp_connect_time_out 15000
localnet 127.0.0.0/255.0.0.0
localnet 127.0.0.1/255.255.255.255
localnet 10.0.0.0/255.0.0.0
localnet 172.16.0.0/255.240.0.0
localnet 192.168.0.0/255.255.0.0
localnet 169.254.0.0/255.255.0.0
[ProxyList]
socks5 ${NODE_IP} ${TUNNEL_PORT}
PCEOF

    echo "[proxy] ✓ Generated proxychains config at $CFG_PATH"
    echo "[proxy] - Internal traffic (10.x.x.x, 172.x.x.x, 169.254.x.x) → DIRECT"
    echo "[proxy] - External traffic (internet) → PROXY via tunnel"

    # ============================================================================
    # Export proxychains configuration based on mode
    # ============================================================================
    export PROXYCHAINS_CONF_FILE="$CFG_PATH"
    export PROXYCHAINS_SOCKS5_HOST="${NODE_IP}"
    export PROXYCHAINS_SOCKS5_PORT="${TUNNEL_PORT}"

    # NOTE(review): the mode-specific exports below are disabled, so neither
    # PROXYCHAINS_BINARY nor LD_PRELOAD is set by this script. In "ldpreload"
    # mode the plain `curl` connectivity test further down therefore does not
    # go through the tunnel, although its messages mention LD_PRELOAD.
    # if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
    # # Binary wrapper approach (Jupiter ARM GH200)
    # # Ray workers will use: proxychains4 -f $PROXYCHAINS_CONF_FILE ray start ...
    # export PROXYCHAINS_BINARY="$PROXYCHAINS_BIN"
    # echo "[proxy] ✓ PROXYCHAINS_BINARY=$PROXYCHAINS_BIN"
    # echo "[proxy] ✓ PROXYCHAINS_CONF_FILE=$CFG_PATH"
    # echo "[proxy] ✓ PROXYCHAINS_SOCKS5_HOST=${NODE_IP} (accessible from worker nodes)"
    # echo "[proxy] ✓ PROXYCHAINS_SOCKS5_PORT=${TUNNEL_PORT}"
    # else
    # # LD_PRELOAD approach (Jureca, Juwels)
    # # Ray workers inherit proxy via LD_PRELOAD environment variable
    # export LD_PRELOAD="$PROXYCHAINS_LIB"
    # echo "[proxy] ✓ LD_PRELOAD set to $PROXYCHAINS_LIB"
    # echo "[proxy] ✓ PROXYCHAINS_CONF_FILE=$CFG_PATH"
    # echo "[proxy] ✓ PROXYCHAINS_SOCKS5_HOST=${NODE_IP} (accessible from worker nodes)"
    # echo "[proxy] ✓ PROXYCHAINS_SOCKS5_PORT=${TUNNEL_PORT}"
    # fi

    # ============================================================================
    # Daytona/aiohttp timeout and retry settings
    # ============================================================================
    export DAYTONA_MAX_RETRIES=5
    export DAYTONA_RETRY_DELAY=30
    export DAYTONA_BACKOFF_FACTOR=2
    export DAYTONA_TIMEOUT=1800 # 30 minutes
    export AIOHTTP_CLIENT_TIMEOUT=900 # 15 minutes
    export AIOHTTP_CONNECTOR_TIMEOUT=900
    export AIOHTTP_SOCK_CONNECT_TIMEOUT=300
    export AIOHTTP_TOTAL_TIMEOUT=1800

    # Disable SSL verification (JSC certificate issues)
    export PYTHONHTTPSVERIFY=0
    unset SSL_CERT_FILE
    unset CURL_CA_BUNDLE
    unset REQUESTS_CA_BUNDLE
    unset SSL_CERT_DIR

    echo "[proxy] ✓ Daytona timeout settings configured"

    # Test proxy connectivity
    echo "[proxy] Testing proxy connectivity..."
    if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
      if "$PROXYCHAINS_BIN" -f "$CFG_PATH" curl -s --connect-timeout 10 https://huggingface.co -o /dev/null; then
        echo "[proxy] ✓ Proxy connectivity test passed (huggingface.co reachable via wrapped binary)"
      else
        echo "[proxy] ⚠ Proxy connectivity test failed (may still work for Daytona)"
      fi
    else
      if curl -s --connect-timeout 10 https://huggingface.co -o /dev/null 2>/dev/null; then
        echo "[proxy] ✓ Proxy connectivity test passed (huggingface.co reachable via LD_PRELOAD)"
      else
        echo "[proxy] ⚠ Proxy connectivity test failed (may still work for Daytona)"
      fi
    fi

    # Test that tunnel is accessible from this node's IP (for worker node access)
    if nc -z "${NODE_IP}" "${TUNNEL_PORT}" 2>/dev/null; then
      echo "[proxy] ✓ Tunnel accessible at ${NODE_IP}:${TUNNEL_PORT} (workers can connect)"
    else
      echo "[proxy] ⚠ Tunnel not accessible at ${NODE_IP}:${TUNNEL_PORT} (workers may fail)"
    fi

    if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
      echo "[proxy] ✓ Proxy setup complete (using wrapped binary for Ray workers)"
    else
      echo "[proxy] ✓ Proxy setup complete (using LD_PRELOAD for Ray worker inheritance)"
    fi
  fi
}

setup_proxy_tunnel
|
||||
|
||||
|
||||
# --- Run the SFT job via Python runner ---
|
||||
echo "=== Universal SFT Training Runner ==="
|
||||
echo "Config: /e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_sft_config.json"
|
||||
echo "Working directory: $WORKDIR"
|
||||
echo "Nodes: $NUM_NODES, GPUs/node: $NUM_GPUS_PER_NODE"
|
||||
echo "======================================"
|
||||
|
||||
echo LD_LIBRARY_PATH=$LD_LIBRARY_PATH
|
||||
srun --mpi=none --nodes=24 $PROXY_CMD bash -c 'python -m hpc.sft_launch_utils --config "/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_sft_config.json"'
|
||||
15
training_configs/sft_config.json
Normal file
15
training_configs/sft_config.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"job_name": "g1_gptlong_top8_32b__Qwen3-8B",
|
||||
"train_config_path": "/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_train_config.yaml",
|
||||
"experiments_dir": "/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b",
|
||||
"cluster_name": "jupiter",
|
||||
"num_nodes": 24,
|
||||
"gpus_per_node": 4,
|
||||
"cpus_per_node": 288,
|
||||
"launcher": "accelerate",
|
||||
"accelerate_config_path": null,
|
||||
"deepspeed_config": "sft/lf_configs/deepspeed/ds_z3_accelerate.json",
|
||||
"master_port": 12802,
|
||||
"needs_ssh_tunnel": true,
|
||||
"needs_cuda_detection": false
|
||||
}
|
||||
52
training_configs/train_config.yaml
Normal file
52
training_configs/train_config.yaml
Normal file
@@ -0,0 +1,52 @@
|
||||
adam_beta2: 0.98
|
||||
assistant_tag: assistant
|
||||
attn: fa2
|
||||
bf16: true
|
||||
content_tag: content
|
||||
cutoff_len: 32768
|
||||
dataloader_num_workers: 4
|
||||
dataloader_persistent_workers: true
|
||||
dataloader_pin_memory: true
|
||||
dataset: /e/scratch/jureap59/raoof1/sft_data/hf_hub/datasets--DCAgent--g1_min_episodes_e1_gpt_long_top8_glm47_traces/snapshots/9828cc7d5cb31c19ed7e6dead76bd24dc2d66262_thinking_preprocessed
|
||||
dataset_dir: ONLINE
|
||||
datasets_cache_dir: /e/scratch/jureap59/raoof1/sft_data/arrow_cache
|
||||
ddp_timeout: 180000000
|
||||
deepspeed: sft/lf_configs/deepspeed/ds_z3_accelerate.json
|
||||
do_train: true
|
||||
enable_liger_kernel: true
|
||||
finetuning_type: full
|
||||
formatting: sharegpt
|
||||
gradient_accumulation_steps: 1
|
||||
gradient_checkpointing: true
|
||||
hub_model_id: DCAgent/g1_gptlong_top8_32b
|
||||
include_mfu: true
|
||||
learning_rate: 4.0e-05
|
||||
load_best_model_at_end: false
|
||||
logging_steps: 5
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: cosine
|
||||
max_grad_norm: 0.001
|
||||
messages: conversations
|
||||
model_name_or_path: /e/scratch/jureap59/raoof1/sft_data/hf_hub/models--Qwen--Qwen3-32B/snapshots/9216db5781bf21249d130ec9da846c4624c16137
|
||||
num_train_epochs: 5.0
|
||||
optim: adamw_torch_fused
|
||||
output_dir: /e/scratch/jureap59/raoof1/sft_data/checkpoints/sft_g1_gptlong_top8_32b__Qwen3-32B
|
||||
overwrite_cache: true
|
||||
per_device_train_batch_size: 1
|
||||
plot_loss: true
|
||||
preprocessing_num_workers: 16
|
||||
pure_bf16: false
|
||||
push_to_hub: false
|
||||
role_tag: role
|
||||
run_name: g1_gptlong_top8_32b__Qwen3-8B
|
||||
save_steps: 300
|
||||
save_strategy: steps
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
stage: sft
|
||||
template: qwen3
|
||||
trust_remote_code: true
|
||||
user_tag: user
|
||||
warmup_ratio: 0.1
|
||||
weight_decay: 0.04
|
||||
disable_shuffling: true
|
||||
Reference in New Issue
Block a user