1216 lines
229 KiB
Plaintext
1216 lines
229 KiB
Plaintext
2026-04-14 19:26:26 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1 distributed training: True, 16-bits training: False
|
||
2026-04-14 19:26:26 - INFO - __main__ - Model parameters ModelArguments(base_model_revision=None, model_name_or_path='Qwen/Qwen3-8B-Base', model_revision='main', model_code_revision=None, torch_dtype='bfloat16', tokenizer_name_or_path=None, trust_remote_code=False, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False, bnb_4bit_quant_storage='uint8')
|
||
2026-04-14 19:26:26 - INFO - __main__ - Data parameters DataArguments(chat_template=None, dataset_mixer={'Anthropic/hh-rlhf': 1.0}, text_column='text', dataset_splits=['train', 'test'], dataset_configs=['helpful-base'], dataset_dir=None, preprocessing_num_workers=12, use_persistent_hf_cache=False, hf_cache_dir=None, truncation_side=None, auto_insert_empty_system_msg=True, preprocessing_log_samples=0, preprocessing_log_dir=None)
|
||
2026-04-14 19:26:26 - INFO - __main__ - Training/evaluation parameters SFTConfig(
|
||
_n_gpu=1,
|
||
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
|
||
adafactor=False,
|
||
adam_beta1=0.9,
|
||
adam_beta2=0.999,
|
||
adam_epsilon=1e-08,
|
||
auto_find_batch_size=False,
|
||
average_tokens_across_devices=False,
|
||
batch_eval_metrics=False,
|
||
bf16=True,
|
||
bf16_full_eval=False,
|
||
chars_per_token=<CHARS_PER_TOKEN>,
|
||
data_seed=None,
|
||
dataloader_drop_last=False,
|
||
dataloader_num_workers=0,
|
||
dataloader_persistent_workers=False,
|
||
dataloader_pin_memory=True,
|
||
dataloader_prefetch_factor=None,
|
||
dataset_batch_size=1000,
|
||
dataset_kwargs=None,
|
||
dataset_num_proc=None,
|
||
dataset_text_field=None,
|
||
ddp_backend=None,
|
||
ddp_broadcast_buffers=None,
|
||
ddp_bucket_cap_mb=None,
|
||
ddp_find_unused_parameters=None,
|
||
ddp_timeout=1800,
|
||
debug=[],
|
||
deepspeed=None,
|
||
disable_tqdm=False,
|
||
do_eval=True,
|
||
do_predict=False,
|
||
do_train=False,
|
||
eval_accumulation_steps=None,
|
||
eval_delay=0,
|
||
eval_do_concat_batches=True,
|
||
eval_on_start=False,
|
||
eval_packing=None,
|
||
eval_steps=100,
|
||
eval_strategy=IntervalStrategy.STEPS,
|
||
eval_use_gather_object=False,
|
||
fp16=False,
|
||
fp16_backend=auto,
|
||
fp16_full_eval=False,
|
||
fp16_opt_level=O1,
|
||
fsdp=[],
|
||
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
|
||
fsdp_min_num_params=0,
|
||
fsdp_transformer_layer_cls_to_wrap=None,
|
||
full_determinism=False,
|
||
gradient_accumulation_steps=1,
|
||
gradient_checkpointing=True,
|
||
gradient_checkpointing_kwargs={'use_reentrant': False},
|
||
greater_is_better=None,
|
||
group_by_length=False,
|
||
half_precision_backend=auto,
|
||
hub_always_push=False,
|
||
hub_model_id=qwen3-8b-base-sft-hh-helpful-8xh200,
|
||
hub_model_revision=main,
|
||
hub_private_repo=None,
|
||
hub_strategy=HubStrategy.END,
|
||
hub_token=<HUB_TOKEN>,
|
||
ignore_data_skip=False,
|
||
include_for_metrics=[],
|
||
include_inputs_for_metrics=False,
|
||
include_num_input_tokens_seen=False,
|
||
include_tokens_per_second=False,
|
||
jit_mode_eval=False,
|
||
label_names=None,
|
||
label_smoothing_factor=0.0,
|
||
learning_rate=2e-05,
|
||
length_column_name=length,
|
||
load_best_model_at_end=False,
|
||
local_rank=0,
|
||
log_level=info,
|
||
log_level_replica=warning,
|
||
log_on_each_node=True,
|
||
logging_dir=outputs/qwen3-8b-base-sft-hh-helpful-8xh200/runs/Apr14_19-26-24_d4053,
|
||
logging_first_step=True,
|
||
logging_nan_inf_filter=True,
|
||
logging_steps=10,
|
||
logging_strategy=IntervalStrategy.STEPS,
|
||
lr_scheduler_kwargs={},
|
||
lr_scheduler_type=SchedulerType.COSINE,
|
||
max_grad_norm=1.0,
|
||
max_seq_length=512,
|
||
max_steps=-1,
|
||
metric_for_best_model=None,
|
||
model_init_kwargs=None,
|
||
mp_parameters=,
|
||
neftune_noise_alpha=None,
|
||
no_cuda=False,
|
||
num_of_sequences=1024,
|
||
num_train_epochs=1,
|
||
optim=OptimizerNames.ADAMW_TORCH,
|
||
optim_args=None,
|
||
optim_target_modules=None,
|
||
output_dir=/scratch/qu.yang1/outputs/qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981,
|
||
overwrite_output_dir=True,
|
||
packing=False,
|
||
past_index=-1,
|
||
per_device_eval_batch_size=16,
|
||
per_device_train_batch_size=16,
|
||
prediction_loss_only=False,
|
||
push_to_hub=False,
|
||
push_to_hub_model_id=None,
|
||
push_to_hub_organization=None,
|
||
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
|
||
ray_scope=last,
|
||
remove_unused_columns=True,
|
||
report_to=['wandb'],
|
||
restore_callback_states_from_checkpoint=False,
|
||
resume_from_checkpoint=None,
|
||
run_name=qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981,
|
||
save_on_each_node=False,
|
||
save_only_model=False,
|
||
save_safetensors=True,
|
||
save_steps=200,
|
||
save_strategy=SaveStrategy.STEPS,
|
||
save_total_limit=2,
|
||
seed=42,
|
||
skip_memory_metrics=True,
|
||
tf32=None,
|
||
torch_compile=False,
|
||
torch_compile_backend=None,
|
||
torch_compile_mode=None,
|
||
torch_empty_cache_steps=None,
|
||
torchdynamo=None,
|
||
tp_size=0,
|
||
tpu_metrics_debug=False,
|
||
tpu_num_cores=None,
|
||
use_cpu=False,
|
||
use_ipex=False,
|
||
use_legacy_prediction_loop=False,
|
||
use_liger=False,
|
||
use_liger_kernel=False,
|
||
use_mps_device=False,
|
||
warmup_ratio=0.1,
|
||
warmup_steps=0,
|
||
weight_decay=0.0,
|
||
)
|
||
2026-04-14 19:26:26 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1 distributed training: True, 16-bits training: False
|
||
2026-04-14 19:26:26 - WARNING - __main__ - Process rank: 5, device: cuda:5, n_gpu: 1 distributed training: True, 16-bits training: False
|
||
2026-04-14 19:26:26 - WARNING - __main__ - Process rank: 6, device: cuda:6, n_gpu: 1 distributed training: True, 16-bits training: False
|
||
2026-04-14 19:26:26 - WARNING - __main__ - Process rank: 7, device: cuda:7, n_gpu: 1 distributed training: True, 16-bits training: False
|
||
2026-04-14 19:26:26 - WARNING - __main__ - Process rank: 4, device: cuda:4, n_gpu: 1 distributed training: True, 16-bits training: False
|
||
2026-04-14 19:26:26 - WARNING - __main__ - Process rank: 3, device: cuda:3, n_gpu: 1 distributed training: True, 16-bits training: False
|
||
2026-04-14 19:26:26 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1 distributed training: True, 16-bits training: False
|
||
hf://datasets/Anthropic/hh-rlhf@09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/README.md not found in cache or force_download set to True, downloading to /scratch/qu.yang1/hf/datasets/downloads/4f81b314c38f15a41852e8177b3798694a3b34661cba63570f6f4cb2bfd194b9.incomplete
|
||
2026-04-14 19:26:26 - INFO - datasets.utils.file_utils - hf://datasets/Anthropic/hh-rlhf@09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/README.md not found in cache or force_download set to True, downloading to /scratch/qu.yang1/hf/datasets/downloads/4f81b314c38f15a41852e8177b3798694a3b34661cba63570f6f4cb2bfd194b9.incomplete
|
||
|
||
Downloading readme: 0%| | 0.00/5.77k [00:00<?, ?B/s]
|
||
Downloading readme: 39%|███▉ | 2.26k/5.77k [00:00<00:00, 22.1kB/s]
|
||
Downloading readme: 100%|██████████| 5.77k/5.77k [00:00<00:00, 54.2kB/s]
|
||
storing hf://datasets/Anthropic/hh-rlhf@09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/README.md in cache at /scratch/qu.yang1/hf/datasets/downloads/4f81b314c38f15a41852e8177b3798694a3b34661cba63570f6f4cb2bfd194b9
|
||
2026-04-14 19:26:27 - INFO - datasets.utils.file_utils - storing hf://datasets/Anthropic/hh-rlhf@09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/README.md in cache at /scratch/qu.yang1/hf/datasets/downloads/4f81b314c38f15a41852e8177b3798694a3b34661cba63570f6f4cb2bfd194b9
|
||
creating metadata file for /scratch/qu.yang1/hf/datasets/downloads/4f81b314c38f15a41852e8177b3798694a3b34661cba63570f6f4cb2bfd194b9
|
||
2026-04-14 19:26:27 - INFO - datasets.utils.file_utils - creating metadata file for /scratch/qu.yang1/hf/datasets/downloads/4f81b314c38f15a41852e8177b3798694a3b34661cba63570f6f4cb2bfd194b9
|
||
No config specified, defaulting to the single config: hh-rlhf/default
|
||
2026-04-14 19:26:27 - INFO - datasets.builder - No config specified, defaulting to the single config: hh-rlhf/default
|
||
Using custom data configuration default-cfba128a0ab1b99f
|
||
2026-04-14 19:26:27 - INFO - datasets.builder - Using custom data configuration default-cfba128a0ab1b99f
|
||
Loading Dataset Infos from /home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/datasets/packaged_modules/json
|
||
2026-04-14 19:26:27 - INFO - datasets.info - Loading Dataset Infos from /home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/datasets/packaged_modules/json
|
||
Generating dataset hh-rlhf (/scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa)
|
||
2026-04-14 19:26:27 - INFO - datasets.builder - Generating dataset hh-rlhf (/scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa)
|
||
Downloading and preparing dataset hh-rlhf/default to /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa...
|
||
2026-04-14 19:26:27 - INFO - datasets.builder - Downloading and preparing dataset hh-rlhf/default to /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa...
|
||
hf://datasets/Anthropic/hh-rlhf@09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/helpful-base/train.jsonl.gz not found in cache or force_download set to True, downloading to /scratch/qu.yang1/hf/datasets/downloads/d0111279a22b3858ad1057a18233beb9d57be16a3c41f6e80a5f3a006fee8984.incomplete
|
||
2026-04-14 19:26:27 - INFO - datasets.utils.file_utils - hf://datasets/Anthropic/hh-rlhf@09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/helpful-base/train.jsonl.gz not found in cache or force_download set to True, downloading to /scratch/qu.yang1/hf/datasets/downloads/d0111279a22b3858ad1057a18233beb9d57be16a3c41f6e80a5f3a006fee8984.incomplete
|
||
|
||
Downloading data: 0%| | 0.00/16.2M [00:00<?, ?B/s]
|
||
Downloading data: 65%|██████▍ | 10.5M/16.2M [00:00<00:00, 22.2MB/s]
|
||
Downloading data: 100%|██████████| 16.2M/16.2M [00:00<00:00, 18.6MB/s]
|
||
Downloading data: 100%|██████████| 16.2M/16.2M [00:00<00:00, 17.0MB/s]
|
||
storing hf://datasets/Anthropic/hh-rlhf@09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/helpful-base/train.jsonl.gz in cache at /scratch/qu.yang1/hf/datasets/downloads/d0111279a22b3858ad1057a18233beb9d57be16a3c41f6e80a5f3a006fee8984
|
||
2026-04-14 19:26:28 - INFO - datasets.utils.file_utils - storing hf://datasets/Anthropic/hh-rlhf@09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/helpful-base/train.jsonl.gz in cache at /scratch/qu.yang1/hf/datasets/downloads/d0111279a22b3858ad1057a18233beb9d57be16a3c41f6e80a5f3a006fee8984
|
||
creating metadata file for /scratch/qu.yang1/hf/datasets/downloads/d0111279a22b3858ad1057a18233beb9d57be16a3c41f6e80a5f3a006fee8984
|
||
2026-04-14 19:26:28 - INFO - datasets.utils.file_utils - creating metadata file for /scratch/qu.yang1/hf/datasets/downloads/d0111279a22b3858ad1057a18233beb9d57be16a3c41f6e80a5f3a006fee8984
|
||
hf://datasets/Anthropic/hh-rlhf@09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/helpful-base/test.jsonl.gz not found in cache or force_download set to True, downloading to /scratch/qu.yang1/hf/datasets/downloads/5345654c80aa565147f1a32e4c8a323cf967386448a298258b152a58a4884c54.incomplete
|
||
2026-04-14 19:26:29 - INFO - datasets.utils.file_utils - hf://datasets/Anthropic/hh-rlhf@09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/helpful-base/test.jsonl.gz not found in cache or force_download set to True, downloading to /scratch/qu.yang1/hf/datasets/downloads/5345654c80aa565147f1a32e4c8a323cf967386448a298258b152a58a4884c54.incomplete
|
||
|
||
Downloading data: 0%| | 0.00/875k [00:00<?, ?B/s]
|
||
Downloading data: 100%|██████████| 875k/875k [00:00<00:00, 7.15MB/s]
|
||
Downloading data: 100%|██████████| 875k/875k [00:00<00:00, 6.53MB/s]
|
||
storing hf://datasets/Anthropic/hh-rlhf@09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/helpful-base/test.jsonl.gz in cache at /scratch/qu.yang1/hf/datasets/downloads/5345654c80aa565147f1a32e4c8a323cf967386448a298258b152a58a4884c54
|
||
2026-04-14 19:26:29 - INFO - datasets.utils.file_utils - storing hf://datasets/Anthropic/hh-rlhf@09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/helpful-base/test.jsonl.gz in cache at /scratch/qu.yang1/hf/datasets/downloads/5345654c80aa565147f1a32e4c8a323cf967386448a298258b152a58a4884c54
|
||
creating metadata file for /scratch/qu.yang1/hf/datasets/downloads/5345654c80aa565147f1a32e4c8a323cf967386448a298258b152a58a4884c54
|
||
2026-04-14 19:26:29 - INFO - datasets.utils.file_utils - creating metadata file for /scratch/qu.yang1/hf/datasets/downloads/5345654c80aa565147f1a32e4c8a323cf967386448a298258b152a58a4884c54
|
||
Downloading took 0.0 min
|
||
2026-04-14 19:26:29 - INFO - datasets.download.download_manager - Downloading took 0.0 min
|
||
Checksum Computation took 0.0 min
|
||
2026-04-14 19:26:29 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min
|
||
Generating train split
|
||
2026-04-14 19:26:29 - INFO - datasets.builder - Generating train split
|
||
|
||
Generating train split: 0 examples [00:00, ? examples/s]
|
||
Generating train split: 13255 examples [00:00, 80077.63 examples/s]
|
||
Generating train split: 26541 examples [00:00, 84534.88 examples/s]
|
||
Generating train split: 39887 examples [00:00, 93151.39 examples/s]
|
||
Generating train split: 43835 examples [00:00, 75478.25 examples/s]
|
||
Generating test split
|
||
2026-04-14 19:26:29 - INFO - datasets.builder - Generating test split
|
||
|
||
Generating test split: 0 examples [00:00, ? examples/s]
|
||
Generating test split: 2354 examples [00:00, 64072.99 examples/s]
|
||
Unable to verify splits sizes.
|
||
2026-04-14 19:26:29 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.
|
||
Dataset hh-rlhf downloaded and prepared to /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa. Subsequent calls will reuse this data.
|
||
2026-04-14 19:26:30 - INFO - datasets.builder - Dataset hh-rlhf downloaded and prepared to /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa. Subsequent calls will reuse this data.
|
||
2026-04-14 19:26:32 - WARNING - alignment.data - Dropped 237 non-canonical HH preference examples from split `train` before normalization (126 x HH preprocessing expects exactly one final assistant response in chosen/rejected suffixes., 111 x HH chosen/rejected transcripts must each contain a divergent assistant response.).
|
||
2026-04-14 19:26:32 - WARNING - alignment.data - Dropped 237 non-canonical HH preference examples from split `train` before normalization (126 x HH preprocessing expects exactly one final assistant response in chosen/rejected suffixes., 111 x HH chosen/rejected transcripts must each contain a divergent assistant response.).
|
||
|
||
Normalizing raw HH preferences (train): 0%| | 0/43598 [00:00<?, ? examples/s]
|
||
Normalizing raw HH preferences (train): 0%| | 0/43598 [00:00<?, ? examples/s]2026-04-14 19:26:32 - WARNING - alignment.data - Dropped 237 non-canonical HH preference examples from split `train` before normalization (126 x HH preprocessing expects exactly one final assistant response in chosen/rejected suffixes., 111 x HH chosen/rejected transcripts must each contain a divergent assistant response.).
|
||
|
||
Normalizing raw HH preferences (train): 0%| | 0/43598 [00:00<?, ? examples/s]
|
||
Normalizing raw HH preferences (train): 2%|▏ | 1038/43598 [00:00<00:04, 10328.54 examples/s]
|
||
Normalizing raw HH preferences (train): 2%|▏ | 1045/43598 [00:00<00:04, 10399.92 examples/s]
|
||
Normalizing raw HH preferences (train): 2%|▏ | 1088/43598 [00:00<00:03, 10825.45 examples/s]2026-04-14 19:26:32 - WARNING - alignment.data - Dropped 237 non-canonical HH preference examples from split `train` before normalization (126 x HH preprocessing expects exactly one final assistant response in chosen/rejected suffixes., 111 x HH chosen/rejected transcripts must each contain a divergent assistant response.).
|
||
|
||
Normalizing raw HH preferences (train): 0%| | 0/43598 [00:00<?, ? examples/s]
|
||
Normalizing raw HH preferences (train): 5%|▌ | 2250/43598 [00:00<00:03, 11376.75 examples/s]
|
||
Normalizing raw HH preferences (train): 5%|▌ | 2251/43598 [00:00<00:03, 11369.77 examples/s]
|
||
Normalizing raw HH preferences (train): 5%|▌ | 2385/43598 [00:00<00:03, 12081.03 examples/s]
|
||
Normalizing raw HH preferences (train): 3%|▎ | 1105/43598 [00:00<00:03, 10997.18 examples/s]
|
||
Normalizing raw HH preferences (train): 8%|▊ | 3484/43598 [00:00<00:03, 11810.59 examples/s]
|
||
Normalizing raw HH preferences (train): 8%|▊ | 3529/43598 [00:00<00:03, 12009.33 examples/s]2026-04-14 19:26:32 - WARNING - alignment.data - Dropped 237 non-canonical HH preference examples from split `train` before normalization (126 x HH preprocessing expects exactly one final assistant response in chosen/rejected suffixes., 111 x HH chosen/rejected transcripts must each contain a divergent assistant response.).
|
||
|
||
Normalizing raw HH preferences (train): 0%| | 0/43598 [00:00<?, ? examples/s]2026-04-14 19:26:32 - WARNING - alignment.data - Dropped 237 non-canonical HH preference examples from split `train` before normalization (126 x HH preprocessing expects exactly one final assistant response in chosen/rejected suffixes., 111 x HH chosen/rejected transcripts must each contain a divergent assistant response.).
|
||
|
||
Normalizing raw HH preferences (train): 0%| | 0/43598 [00:00<?, ? examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-d6e6bfbe34161664.arrow
|
||
2026-04-14 19:26:32 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-d6e6bfbe34161664.arrow
|
||
|
||
Normalizing raw HH preferences (train): 8%|▊ | 3700/43598 [00:00<00:03, 12425.92 examples/s]
|
||
Normalizing raw HH preferences (train): 5%|▌ | 2359/43598 [00:00<00:03, 11899.09 examples/s]
|
||
Normalizing raw HH preferences (train): 11%|█ | 4734/43598 [00:00<00:03, 12081.82 examples/s]
|
||
Normalizing raw HH preferences (train): 11%|█ | 4791/43598 [00:00<00:03, 12244.14 examples/s]
|
||
Normalizing raw HH preferences (train): 2%|▏ | 1000/43598 [00:00<00:04, 9673.48 examples/s]
|
||
Normalizing raw HH preferences (train): 2%|▏ | 1051/43598 [00:00<00:04, 10430.06 examples/s]
|
||
Normalizing raw HH preferences (train): 11%|█▏ | 4953/43598 [00:00<00:03, 12463.98 examples/s]
|
||
Normalizing raw HH preferences (train): 8%|▊ | 3698/43598 [00:00<00:03, 12329.99 examples/s]
|
||
Normalizing raw HH preferences (train): 14%|█▍ | 5996/43598 [00:00<00:03, 12269.30 examples/s]
|
||
Normalizing raw HH preferences (train): 5%|▌ | 2295/43598 [00:00<00:03, 11570.92 examples/s]
|
||
Normalizing raw HH preferences (train): 15%|█▌ | 6689/43598 [00:00<00:03, 12299.81 examples/s]
|
||
Normalizing raw HH preferences (train): 5%|▌ | 2348/43598 [00:00<00:03, 11913.62 examples/s]
|
||
Normalizing raw HH preferences (train): 11%|█▏ | 4949/43598 [00:00<00:03, 12398.04 examples/s]
|
||
Normalizing raw HH preferences (train): 16%|█▌ | 6803/43598 [00:00<00:02, 12402.65 examples/s]
|
||
Normalizing raw HH preferences (train): 8%|▊ | 3588/43598 [00:00<00:03, 12183.77 examples/s]
|
||
Normalizing raw HH preferences (train): 18%|█▊ | 7874/43598 [00:00<00:02, 12375.73 examples/s]
|
||
Normalizing raw HH preferences (train): 18%|█▊ | 7972/43598 [00:00<00:02, 12456.45 examples/s]
|
||
Normalizing raw HH preferences (train): 9%|▊ | 3706/43598 [00:00<00:03, 12422.26 examples/s]
|
||
Normalizing raw HH preferences (train): 11%|█ | 4860/43598 [00:00<00:03, 12390.88 examples/s]
|
||
Normalizing raw HH preferences (train): 16%|█▌ | 6802/43598 [00:00<00:02, 12373.31 examples/s]
|
||
Normalizing raw HH preferences (train): 11%|█▏ | 4977/43598 [00:00<00:03, 12531.67 examples/s]
|
||
Normalizing raw HH preferences (train): 20%|█▉ | 8687/43598 [00:00<00:02, 12415.01 examples/s]
|
||
Normalizing raw HH preferences (train): 22%|██▏ | 9763/43598 [00:00<00:02, 12453.86 examples/s]
|
||
Normalizing raw HH preferences (train): 23%|██▎ | 9849/43598 [00:00<00:02, 12472.09 examples/s]
|
||
Normalizing raw HH preferences (train): 23%|██▎ | 9971/43598 [00:00<00:02, 12527.79 examples/s]
|
||
Normalizing raw HH preferences (train): 15%|█▌ | 6722/43598 [00:00<00:02, 12396.84 examples/s]
|
||
Normalizing raw HH preferences (train): 20%|█▉ | 8688/43598 [00:00<00:02, 12411.86 examples/s]
|
||
Normalizing raw HH preferences (train): 16%|█▌ | 6851/43598 [00:00<00:02, 12509.87 examples/s]
|
||
Normalizing raw HH preferences (train): 27%|██▋ | 11700/43598 [00:00<00:02, 12539.65 examples/s]
|
||
Normalizing raw HH preferences (train): 27%|██▋ | 11727/43598 [00:00<00:02, 12480.93 examples/s]
|
||
Normalizing raw HH preferences (train): 23%|██▎ | 9970/43598 [00:00<00:02, 12519.49 examples/s]
|
||
Normalizing raw HH preferences (train): 18%|█▊ | 8000/43598 [00:00<00:02, 12257.20 examples/s]
|
||
Normalizing raw HH preferences (train): 27%|██▋ | 11818/43598 [00:00<00:02, 12447.60 examples/s]
|
||
Normalizing raw HH preferences (train): 30%|██▉ | 12996/43598 [00:01<00:02, 12641.70 examples/s]
|
||
Normalizing raw HH preferences (train): 30%|██▉ | 13000/43598 [00:01<00:02, 12305.63 examples/s]
|
||
Normalizing raw HH preferences (train): 20%|██ | 8744/43598 [00:00<00:02, 12546.91 examples/s]
|
||
Normalizing raw HH preferences (train): 21%|██▏ | 9281/43598 [00:00<00:02, 12421.18 examples/s]
|
||
Normalizing raw HH preferences (train): 27%|██▋ | 11855/43598 [00:00<00:02, 12533.05 examples/s]
|
||
Normalizing raw HH preferences (train): 33%|███▎ | 14243/43598 [00:01<00:02, 12334.21 examples/s]
|
||
Normalizing raw HH preferences (train): 31%|███▏ | 13645/43598 [00:01<00:02, 12322.78 examples/s]
|
||
Normalizing raw HH preferences (train): 23%|██▎ | 10000/43598 [00:00<00:02, 12383.67 examples/s]2026-04-14 19:26:33 - WARNING - alignment.data - Dropped 237 non-canonical HH preference examples from split `train` before normalization (126 x HH preprocessing expects exactly one final assistant response in chosen/rejected suffixes., 111 x HH chosen/rejected transcripts must each contain a divergent assistant response.).
|
||
|
||
Normalizing raw HH preferences (train): 0%| | 0/43598 [00:00<?, ? examples/s]
|
||
Normalizing raw HH preferences (train): 34%|███▍ | 14871/43598 [00:01<00:02, 12590.68 examples/s]
|
||
Normalizing raw HH preferences (train): 26%|██▌ | 11129/43598 [00:00<00:02, 12377.76 examples/s]2026-04-14 19:26:33 - WARNING - alignment.data - Dropped 237 non-canonical HH preference examples from split `train` before normalization (126 x HH preprocessing expects exactly one final assistant response in chosen/rejected suffixes., 111 x HH chosen/rejected transcripts must each contain a divergent assistant response.).
|
||
|
||
Normalizing raw HH preferences (train): 0%| | 0/43598 [00:00<?, ? examples/s]
|
||
Normalizing raw HH preferences (train): 36%|███▌ | 15529/43598 [00:01<00:02, 12473.47 examples/s]
|
||
Normalizing raw HH preferences (train): 34%|███▍ | 14925/43598 [00:01<00:02, 12437.24 examples/s]
|
||
Normalizing raw HH preferences (train): 26%|██▌ | 11312/43598 [00:00<00:02, 12585.92 examples/s]
|
||
Normalizing raw HH preferences (train): 2%|▏ | 836/43598 [00:00<00:05, 8301.52 examples/s]
|
||
Normalizing raw HH preferences (train): 31%|███▏ | 13716/43598 [00:01<00:02, 12485.94 examples/s]
|
||
Normalizing raw HH preferences (train): 28%|██▊ | 12420/43598 [00:01<00:02, 12519.76 examples/s]
|
||
Normalizing raw HH preferences (train): 38%|███▊ | 16778/43598 [00:01<00:02, 12625.81 examples/s]
|
||
Normalizing raw HH preferences (train): 39%|███▊ | 16809/43598 [00:01<00:02, 12562.91 examples/s]
|
||
Normalizing raw HH preferences (train): 2%|▏ | 828/43598 [00:00<00:05, 8225.49 examples/s]
|
||
Normalizing raw HH preferences (train): 29%|██▉ | 12707/43598 [00:01<00:02, 12734.21 examples/s]
|
||
Normalizing raw HH preferences (train): 34%|███▍ | 14997/43598 [00:01<00:02, 12561.83 examples/s]
|
||
Normalizing raw HH preferences (train): 39%|███▊ | 16811/43598 [00:01<00:02, 12479.08 examples/s]
|
||
Normalizing raw HH preferences (train): 31%|███▏ | 13696/43598 [00:01<00:02, 12578.09 examples/s]
|
||
Normalizing raw HH preferences (train): 5%|▍ | 2000/43598 [00:00<00:05, 7524.12 examples/s]
|
||
Normalizing raw HH preferences (train): 32%|███▏ | 13995/43598 [00:01<00:02, 12772.75 examples/s]
|
||
Normalizing raw HH preferences (train): 43%|████▎ | 18683/43598 [00:01<00:01, 12543.86 examples/s]
|
||
Normalizing raw HH preferences (train): 5%|▍ | 1992/43598 [00:00<00:05, 7898.94 examples/s]
|
||
Normalizing raw HH preferences (train): 43%|████▎ | 18676/43598 [00:01<00:02, 12444.57 examples/s]
|
||
Normalizing raw HH preferences (train): 39%|███▊ | 16879/43598 [00:01<00:02, 12554.05 examples/s]
|
||
Normalizing raw HH preferences (train): 7%|▋ | 2890/43598 [00:00<00:05, 8043.21 examples/s]
|
||
Normalizing raw HH preferences (train): 43%|████▎ | 18679/43598 [00:01<00:02, 12395.46 examples/s]
|
||
Normalizing raw HH preferences (train): 36%|███▌ | 15556/43598 [00:01<00:02, 12507.62 examples/s]
|
||
Normalizing raw HH preferences (train): 36%|███▋ | 15912/43598 [00:01<00:02, 12772.37 examples/s]
|
||
Normalizing raw HH preferences (train): 7%|▋ | 3096/43598 [00:00<00:05, 7631.22 examples/s]
|
||
Normalizing raw HH preferences (train): 43%|████▎ | 18717/43598 [00:01<00:01, 12453.08 examples/s]
|
||
Normalizing raw HH preferences (train): 39%|███▊ | 16848/43598 [00:01<00:02, 12612.92 examples/s]
|
||
Normalizing raw HH preferences (train): 9%|▉ | 4000/43598 [00:00<00:05, 7700.56 examples/s]
|
||
Normalizing raw HH preferences (train): 9%|▉ | 3972/43598 [00:00<00:04, 7988.25 examples/s]
|
||
Normalizing raw HH preferences (train): 46%|████▌ | 20136/43598 [00:01<00:02, 9412.78 examples/s]
|
||
Normalizing raw HH preferences (train): 41%|████ | 17774/43598 [00:01<00:02, 12643.29 examples/s]
|
||
Normalizing raw HH preferences (train): 46%|████▌ | 20135/43598 [00:01<00:02, 9100.15 examples/s]
|
||
Normalizing raw HH preferences (train): 11%|█ | 4878/43598 [00:00<00:04, 8019.23 examples/s]
|
||
Normalizing raw HH preferences (train): 46%|████▌ | 20136/43598 [00:01<00:02, 9322.39 examples/s]
|
||
Normalizing raw HH preferences (train): 43%|████▎ | 18681/43598 [00:01<00:02, 12451.62 examples/s]
|
||
Normalizing raw HH preferences (train): 49%|████▉ | 21422/43598 [00:01<00:02, 10082.23 examples/s]
|
||
Normalizing raw HH preferences (train): 49%|████▉ | 21394/43598 [00:01<00:02, 9823.70 examples/s]
|
||
Normalizing raw HH preferences (train): 12%|█▏ | 5039/43598 [00:00<00:05, 7629.17 examples/s]
|
||
Normalizing raw HH preferences (train): 49%|████▉ | 21411/43598 [00:01<00:02, 9996.39 examples/s]
|
||
Normalizing raw HH preferences (train): 46%|████▌ | 20139/43598 [00:01<00:02, 9381.31 examples/s]
|
||
Normalizing raw HH preferences (train): 14%|█▍ | 6000/43598 [00:00<00:04, 7703.56 examples/s]
|
||
Normalizing raw HH preferences (train): 52%|█████▏ | 22697/43598 [00:01<00:01, 10659.75 examples/s]
|
||
Normalizing raw HH preferences (train): 52%|█████▏ | 22698/43598 [00:01<00:01, 10521.86 examples/s]
|
||
Normalizing raw HH preferences (train): 14%|█▎ | 5923/43598 [00:00<00:04, 7964.31 examples/s]
|
||
Normalizing raw HH preferences (train): 52%|█████▏ | 22699/43598 [00:01<00:01, 10625.89 examples/s]
|
||
Normalizing raw HH preferences (train): 49%|████▉ | 21403/43598 [00:01<00:02, 10030.27 examples/s]
|
||
Normalizing raw HH preferences (train): 45%|████▍ | 19507/43598 [00:01<00:02, 9663.19 examples/s]
|
||
Normalizing raw HH preferences (train): 16%|█▌ | 6873/43598 [00:00<00:04, 7977.04 examples/s]
|
||
Normalizing raw HH preferences (train): 55%|█████▌ | 23985/43598 [00:02<00:01, 11182.54 examples/s]
|
||
Normalizing raw HH preferences (train): 55%|█████▌ | 23985/43598 [00:02<00:01, 11098.13 examples/s]
|
||
Normalizing raw HH preferences (train): 55%|█████▌ | 23981/43598 [00:02<00:01, 11145.77 examples/s]
|
||
Normalizing raw HH preferences (train): 48%|████▊ | 20774/43598 [00:01<00:02, 10262.64 examples/s]
|
||
Normalizing raw HH preferences (train): 52%|█████▏ | 22693/43598 [00:01<00:01, 10632.12 examples/s]
|
||
Normalizing raw HH preferences (train): 46%|████▌ | 20134/43598 [00:01<00:02, 8538.91 examples/s]
|
||
Normalizing raw HH preferences (train): 16%|█▌ | 7010/43598 [00:00<00:04, 7691.04 examples/s]
|
||
Normalizing raw HH preferences (train): 18%|█▊ | 8000/43598 [00:01<00:04, 7735.43 examples/s]
|
||
Normalizing raw HH preferences (train): 59%|█████▉ | 25788/43598 [00:02<00:01, 11459.27 examples/s]
|
||
Normalizing raw HH preferences (train): 59%|█████▉ | 25747/43598 [00:02<00:01, 11319.13 examples/s]
|
||
Normalizing raw HH preferences (train): 55%|█████▍ | 23977/43598 [00:02<00:01, 11160.65 examples/s]
|
||
Normalizing raw HH preferences (train): 49%|████▉ | 21367/43598 [00:01<00:02, 9271.39 examples/s]
|
||
Normalizing raw HH preferences (train): 50%|█████ | 22000/43598 [00:01<00:02, 10597.98 examples/s]
|
||
Normalizing raw HH preferences (train): 18%|█▊ | 7893/43598 [00:01<00:04, 7986.19 examples/s]
|
||
Normalizing raw HH preferences (train): 59%|█████▉ | 25784/43598 [00:02<00:01, 11437.63 examples/s]
|
||
Normalizing raw HH preferences (train): 20%|██ | 8881/43598 [00:01<00:04, 8006.67 examples/s]
|
||
Normalizing raw HH preferences (train): 62%|██████▏ | 27000/43598 [00:02<00:01, 11423.59 examples/s]
|
||
Normalizing raw HH preferences (train): 62%|██████▏ | 27000/43598 [00:02<00:01, 11406.47 examples/s]
|
||
Normalizing raw HH preferences (train): 53%|█████▎ | 23284/43598 [00:01<00:01, 11133.92 examples/s]
|
||
Normalizing raw HH preferences (train): 52%|█████▏ | 22700/43598 [00:02<00:02, 10073.76 examples/s]
|
||
Normalizing raw HH preferences (train): 62%|██████▏ | 27000/43598 [00:02<00:01, 11464.15 examples/s]
|
||
Normalizing raw HH preferences (train): 59%|█████▉ | 25808/43598 [00:02<00:01, 11512.19 examples/s]
|
||
Normalizing raw HH preferences (train): 21%|██ | 9000/43598 [00:01<00:04, 7741.33 examples/s]
|
||
Normalizing raw HH preferences (train): 65%|██████▍ | 28248/43598 [00:02<00:01, 11688.33 examples/s]
|
||
Normalizing raw HH preferences (train): 65%|██████▍ | 28262/43598 [00:02<00:01, 11714.53 examples/s]
|
||
Normalizing raw HH preferences (train): 56%|█████▋ | 24563/43598 [00:02<00:01, 11554.12 examples/s]
|
||
Normalizing raw HH preferences (train): 55%|█████▌ | 23992/43598 [00:02<00:01, 10738.37 examples/s]
|
||
Normalizing raw HH preferences (train): 23%|██▎ | 10000/43598 [00:01<00:04, 7786.83 examples/s]
|
||
Normalizing raw HH preferences (train): 65%|██████▍ | 28257/43598 [00:02<00:01, 11743.37 examples/s]
|
||
Normalizing raw HH preferences (train): 23%|██▎ | 9878/43598 [00:01<00:04, 8002.36 examples/s]
|
||
Normalizing raw HH preferences (train): 68%|██████▊ | 29540/43598 [00:02<00:01, 12012.39 examples/s]
|
||
Normalizing raw HH preferences (train): 68%|██████▊ | 29572/43598 [00:02<00:01, 12083.59 examples/s]
|
||
Normalizing raw HH preferences (train): 64%|██████▎ | 27690/43598 [00:02<00:01, 11766.63 examples/s]
|
||
Normalizing raw HH preferences (train): 59%|█████▉ | 25842/43598 [00:02<00:01, 11881.98 examples/s]
|
||
Normalizing raw HH preferences (train): 25%|██▍ | 10891/43598 [00:01<00:04, 8067.30 examples/s]
|
||
Normalizing raw HH preferences (train): 68%|██████▊ | 29560/43598 [00:02<00:01, 12085.67 examples/s]
|
||
Normalizing raw HH preferences (train): 59%|█████▉ | 25837/43598 [00:02<00:01, 11255.57 examples/s]
|
||
Normalizing raw HH preferences (train): 71%|███████ | 30820/43598 [00:02<00:01, 12227.87 examples/s]
|
||
Normalizing raw HH preferences (train): 71%|███████ | 30857/43598 [00:02<00:01, 12292.92 examples/s]
|
||
Normalizing raw HH preferences (train): 66%|██████▋ | 28913/43598 [00:02<00:01, 11872.51 examples/s]
|
||
Normalizing raw HH preferences (train): 25%|██▌ | 11000/43598 [00:01<00:04, 7782.70 examples/s]
|
||
Normalizing raw HH preferences (train): 71%|███████ | 30842/43598 [00:02<00:01, 12285.71 examples/s]
|
||
Normalizing raw HH preferences (train): 64%|██████▎ | 27693/43598 [00:02<00:01, 12042.48 examples/s]
|
||
Normalizing raw HH preferences (train): 28%|██▊ | 12001/43598 [00:01<00:04, 7830.28 examples/s]
|
||
Normalizing raw HH preferences (train): 64%|██████▎ | 27688/43598 [00:02<00:01, 11589.55 examples/s]
|
||
Normalizing raw HH preferences (train): 27%|██▋ | 11879/43598 [00:01<00:03, 8031.44 examples/s]
|
||
Normalizing raw HH preferences (train): 75%|███████▌ | 32710/43598 [00:02<00:00, 12355.52 examples/s]
|
||
Normalizing raw HH preferences (train): 75%|███████▌ | 32731/43598 [00:02<00:00, 12363.29 examples/s]
|
||
Normalizing raw HH preferences (train): 71%|███████ | 30777/43598 [00:02<00:01, 12056.78 examples/s]
|
||
Normalizing raw HH preferences (train): 66%|██████▋ | 28979/43598 [00:02<00:01, 12250.39 examples/s]
|
||
Normalizing raw HH preferences (train): 30%|██▉ | 12889/43598 [00:01<00:03, 8089.79 examples/s]
|
||
Normalizing raw HH preferences (train): 75%|███████▌ | 32713/43598 [00:02<00:00, 12351.06 examples/s]
|
||
Normalizing raw HH preferences (train): 66%|██████▋ | 28976/43598 [00:02<00:01, 11892.51 examples/s]
|
||
Normalizing raw HH preferences (train): 78%|███████▊ | 33975/43598 [00:02<00:00, 12429.73 examples/s]
|
||
Normalizing raw HH preferences (train): 73%|███████▎ | 32000/43598 [00:02<00:00, 11993.98 examples/s]
|
||
Normalizing raw HH preferences (train): 30%|██▉ | 13000/43598 [00:01<00:03, 7823.39 examples/s]
|
||
Normalizing raw HH preferences (train): 79%|███████▉ | 34522/43598 [00:02<00:00, 12212.38 examples/s]
|
||
Normalizing raw HH preferences (train): 71%|███████ | 30865/43598 [00:02<00:01, 12361.88 examples/s]
|
||
Normalizing raw HH preferences (train): 32%|███▏ | 14000/43598 [00:01<00:03, 7848.54 examples/s]
|
||
Normalizing raw HH preferences (train): 79%|███████▉ | 34533/43598 [00:02<00:00, 12200.96 examples/s]
|
||
Normalizing raw HH preferences (train): 71%|███████ | 30865/43598 [00:02<00:01, 12125.11 examples/s]
|
||
Normalizing raw HH preferences (train): 76%|███████▋ | 33280/43598 [00:02<00:00, 12198.43 examples/s]
|
||
Normalizing raw HH preferences (train): 32%|███▏ | 13862/43598 [00:01<00:03, 8020.01 examples/s]
|
||
Normalizing raw HH preferences (train): 82%|████████▏ | 35840/43598 [00:03<00:00, 12427.89 examples/s]
|
||
Normalizing raw HH preferences (train): 82%|████████▏ | 35763/43598 [00:03<00:00, 12260.35 examples/s]
|
||
Normalizing raw HH preferences (train): 34%|███▍ | 14897/43598 [00:01<00:03, 8125.61 examples/s]
|
||
Normalizing raw HH preferences (train): 82%|████████▏ | 35792/43598 [00:03<00:00, 12295.47 examples/s]
|
||
Normalizing raw HH preferences (train): 75%|███████▌ | 32734/43598 [00:02<00:00, 12393.69 examples/s]
|
||
Normalizing raw HH preferences (train): 79%|███████▉ | 34539/43598 [00:02<00:00, 12299.62 examples/s]
|
||
Normalizing raw HH preferences (train): 85%|████████▍ | 37000/43598 [00:03<00:00, 12071.15 examples/s]
|
||
Normalizing raw HH preferences (train): 75%|███████▍ | 32679/43598 [00:02<00:00, 12109.77 examples/s]
|
||
Normalizing raw HH preferences (train): 34%|███▍ | 15000/43598 [00:01<00:03, 7805.34 examples/s]
|
||
Normalizing raw HH preferences (train): 86%|████████▋ | 37694/43598 [00:03<00:00, 12398.85 examples/s]
|
||
Normalizing raw HH preferences (train): 78%|███████▊ | 33987/43598 [00:02<00:00, 12424.78 examples/s]
|
||
Normalizing raw HH preferences (train): 82%|████████▏ | 35793/43598 [00:03<00:00, 12362.95 examples/s]
|
||
Normalizing raw HH preferences (train): 37%|███▋ | 16015/43598 [00:02<00:03, 7888.47 examples/s]
|
||
Normalizing raw HH preferences (train): 88%|████████▊ | 38213/43598 [00:03<00:00, 12086.23 examples/s]
|
||
Normalizing raw HH preferences (train): 86%|████████▋ | 37697/43598 [00:03<00:00, 12232.76 examples/s]
|
||
Normalizing raw HH preferences (train): 36%|███▋ | 15887/43598 [00:02<00:03, 8067.55 examples/s]
|
||
Normalizing raw HH preferences (train): 89%|████████▉ | 38948/43598 [00:03<00:00, 12429.37 examples/s]
|
||
Normalizing raw HH preferences (train): 79%|███████▉ | 34467/43598 [00:02<00:00, 12048.58 examples/s]
|
||
Normalizing raw HH preferences (train): 39%|███▉ | 16902/43598 [00:02<00:03, 8129.35 examples/s]
|
||
Normalizing raw HH preferences (train): 91%|█████████ | 39503/43598 [00:03<00:00, 12307.94 examples/s]
|
||
Normalizing raw HH preferences (train): 89%|████████▉ | 38948/43598 [00:03<00:00, 12296.76 examples/s]
|
||
Normalizing raw HH preferences (train): 82%|████████▏ | 35823/43598 [00:02<00:00, 12358.24 examples/s]
|
||
Normalizing raw HH preferences (train): 86%|████████▋ | 37686/43598 [00:03<00:00, 12254.94 examples/s]
|
||
Normalizing raw HH preferences (train): 82%|████████▏ | 35744/43598 [00:03<00:00, 12213.60 examples/s]
|
||
Normalizing raw HH preferences (train): 39%|███▉ | 17000/43598 [00:02<00:03, 7816.28 examples/s]
|
||
Normalizing raw HH preferences (train): 94%|█████████▎| 40813/43598 [00:03<00:00, 12428.51 examples/s]
|
||
Normalizing raw HH preferences (train): 94%|█████████▎| 40780/43598 [00:03<00:00, 12436.18 examples/s]
|
||
Normalizing raw HH preferences (train): 41%|████▏ | 18000/43598 [00:02<00:03, 7810.61 examples/s]
|
||
Normalizing raw HH preferences (train): 89%|████████▉ | 38925/43598 [00:03<00:00, 12288.57 examples/s]
|
||
Normalizing raw HH preferences (train): 94%|█████████▎| 40804/43598 [00:03<00:00, 12320.08 examples/s]
|
||
Normalizing raw HH preferences (train): 86%|████████▋ | 37703/43598 [00:03<00:00, 12372.51 examples/s]
|
||
Normalizing raw HH preferences (train): 85%|████████▍ | 37000/43598 [00:03<00:00, 12069.40 examples/s]
|
||
Normalizing raw HH preferences (train): 41%|████ | 17867/43598 [00:02<00:03, 8026.25 examples/s]
|
||
Normalizing raw HH preferences (train): 43%|████▎ | 18866/43598 [00:02<00:03, 8019.23 examples/s]
|
||
Normalizing raw HH preferences (train): 98%|█████████▊| 42703/43598 [00:03<00:00, 12447.50 examples/s]
|
||
Normalizing raw HH preferences (train): 98%|█████████▊| 42701/43598 [00:03<00:00, 12442.26 examples/s]
|
||
Normalizing raw HH preferences (train): 89%|████████▉ | 38966/43598 [00:03<00:00, 12431.35 examples/s]
|
||
Normalizing raw HH preferences (train): 93%|█████████▎| 40742/43598 [00:03<00:00, 12222.33 examples/s]
|
||
Normalizing raw HH preferences (train): 88%|████████▊ | 38275/43598 [00:03<00:00, 12242.58 examples/s]
|
||
Normalizing raw HH preferences (train): 98%|█████████▊| 42697/43598 [00:03<00:00, 12338.44 examples/s]
|
||
Normalizing raw HH preferences (train): 44%|████▎ | 19000/43598 [00:02<00:03, 7767.19 examples/s]
|
||
Normalizing raw HH preferences (train): 96%|█████████▋| 41990/43598 [00:03<00:00, 12284.49 examples/s]
|
||
Normalizing raw HH preferences (train): 91%|█████████ | 39561/43598 [00:03<00:00, 12408.04 examples/s]
|
||
Normalizing raw HH preferences (train): 94%|█████████▎| 40830/43598 [00:03<00:00, 12428.14 examples/s]
|
||
Normalizing raw HH preferences (train): 94%|█████████▎| 40829/43598 [00:03<00:00, 12481.75 examples/s]
|
||
Normalizing raw HH preferences (train): 46%|████▌ | 20000/43598 [00:02<00:04, 5576.93 examples/s]
|
||
Normalizing raw HH preferences (train): 100%|██████████| 43598/43598 [00:03<00:00, 10678.55 examples/s]
|
||
Normalizing raw HH preferences (train): 98%|█████████▊| 42694/43598 [00:03<00:00, 12385.81 examples/s]
|
||
Normalizing raw HH preferences (train): 46%|████▌ | 20126/43598 [00:02<00:03, 5952.56 examples/s]
|
||
Normalizing raw HH preferences (train): 98%|█████████▊| 42699/43598 [00:03<00:00, 12403.41 examples/s]
|
||
Normalizing raw HH preferences (train): 100%|██████████| 43598/43598 [00:03<00:00, 10987.28 examples/s]
|
||
|
||
Normalizing raw HH preferences (train): 100%|██████████| 43598/43598 [00:03<00:00, 10963.09 examples/s]
|
||
|
||
Normalizing raw HH preferences (train): 100%|██████████| 43598/43598 [00:03<00:00, 11134.10 examples/s]
|
||
|
||
Normalizing raw HH preferences (train): 49%|████▉ | 21272/43598 [00:02<00:03, 6932.67 examples/s]
|
||
Normalizing raw HH preferences (train): 49%|████▉ | 21319/43598 [00:02<00:03, 7117.87 examples/s]
|
||
Normalizing raw HH preferences (train): 52%|█████▏ | 22548/43598 [00:02<00:02, 8175.00 examples/s]
|
||
Normalizing raw HH preferences (train): 100%|██████████| 43598/43598 [00:03<00:00, 11104.35 examples/s]
|
||
|
||
Normalizing raw HH preferences (train): 52%|█████▏ | 22699/43598 [00:02<00:02, 8442.57 examples/s]
|
||
Normalizing raw HH preferences (train): 55%|█████▍ | 23791/43598 [00:03<00:02, 9167.35 examples/s]
|
||
Normalizing raw HH preferences (train): 100%|██████████| 43598/43598 [00:03<00:00, 11321.80 examples/s]
|
||
|
||
Normalizing raw HH preferences (train): 100%|██████████| 43598/43598 [00:03<00:00, 11223.64 examples/s]
|
||
|
||
Normalizing raw HH preferences (train): 55%|█████▌ | 23979/43598 [00:03<00:02, 9463.80 examples/s]
|
||
Normalizing raw HH preferences (train): 57%|█████▋ | 25000/43598 [00:03<00:01, 9753.74 examples/s]
|
||
Normalizing raw HH preferences (train): 58%|█████▊ | 25130/43598 [00:03<00:01, 9976.45 examples/s]
|
||
Normalizing raw HH preferences (train): 60%|██████ | 26274/43598 [00:03<00:01, 10528.08 examples/s]
|
||
Normalizing raw HH preferences (train): 60%|██████ | 26368/43598 [00:03<00:01, 10608.38 examples/s]
|
||
Normalizing raw HH preferences (train): 63%|██████▎ | 27541/43598 [00:03<00:01, 11107.66 examples/s]
|
||
Normalizing raw HH preferences (train): 66%|██████▌ | 28819/43598 [00:03<00:01, 11573.46 examples/s]
|
||
Normalizing raw HH preferences (train): 64%|██████▍ | 27839/43598 [00:03<00:01, 10313.97 examples/s]2026-04-14 19:26:36 - WARNING - alignment.data - Dropped 15 non-canonical HH preference examples from split `test` before normalization (9 x HH preprocessing expects exactly one final assistant response in chosen/rejected suffixes., 6 x HH chosen/rejected transcripts must each contain a divergent assistant response.).
|
||
|
||
Normalizing raw HH preferences (test): 0%| | 0/2339 [00:00<?, ? examples/s]2026-04-14 19:26:36 - WARNING - alignment.data - Dropped 15 non-canonical HH preference examples from split `test` before normalization (9 x HH preprocessing expects exactly one final assistant response in chosen/rejected suffixes., 6 x HH chosen/rejected transcripts must each contain a divergent assistant response.).
|
||
|
||
Normalizing raw HH preferences (test): 0%| | 0/2339 [00:00<?, ? examples/s]2026-04-14 19:26:36 - WARNING - alignment.data - Dropped 15 non-canonical HH preference examples from split `test` before normalization (9 x HH preprocessing expects exactly one final assistant response in chosen/rejected suffixes., 6 x HH chosen/rejected transcripts must each contain a divergent assistant response.).
|
||
|
||
Normalizing raw HH preferences (test): 0%| | 0/2339 [00:00<?, ? examples/s]
|
||
Normalizing raw HH preferences (train): 70%|███████ | 30706/43598 [00:03<00:01, 11939.16 examples/s]
|
||
Normalizing raw HH preferences (test): 50%|████▉ | 1162/2339 [00:00<00:00, 11572.53 examples/s]No config specified, defaulting to the single config: hh-rlhf/default
|
||
2026-04-14 19:26:36 - INFO - datasets.builder - No config specified, defaulting to the single config: hh-rlhf/default
|
||
Using custom data configuration default-cfba128a0ab1b99f
|
||
2026-04-14 19:26:36 - INFO - datasets.builder - Using custom data configuration default-cfba128a0ab1b99f
|
||
Loading Dataset Infos from /home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/datasets/packaged_modules/json
|
||
2026-04-14 19:26:36 - INFO - datasets.info - Loading Dataset Infos from /home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/datasets/packaged_modules/json
|
||
|
||
Normalizing raw HH preferences (train): 67%|██████▋ | 29000/43598 [00:03<00:01, 9411.86 examples/s] Overwrite dataset info from restored data version if exists.
|
||
2026-04-14 19:26:36 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.
|
||
Loading Dataset info from /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa
|
||
2026-04-14 19:26:36 - INFO - datasets.info - Loading Dataset info from /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa
|
||
|
||
Normalizing raw HH preferences (test): 51%|█████ | 1185/2339 [00:00<00:00, 11800.97 examples/s]Found cached dataset hh-rlhf (/scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa)
|
||
2026-04-14 19:26:36 - INFO - datasets.builder - Found cached dataset hh-rlhf (/scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa)
|
||
Loading Dataset info from /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa
|
||
2026-04-14 19:26:36 - INFO - datasets.info - Loading Dataset info from /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa
|
||
2026-04-14 19:26:36 - WARNING - alignment.data - Dropped 15 non-canonical HH preference examples from split `test` before normalization (9 x HH preprocessing expects exactly one final assistant response in chosen/rejected suffixes., 6 x HH chosen/rejected transcripts must each contain a divergent assistant response.).
|
||
|
||
Normalizing raw HH preferences (test): 35%|███▌ | 826/2339 [00:00<00:00, 8210.37 examples/s]
|
||
Normalizing raw HH preferences (test): 0%| | 0/2339 [00:00<?, ? examples/s]
|
||
Normalizing raw HH preferences (test): 100%|██████████| 2339/2339 [00:00<00:00, 11434.81 examples/s]
|
||
Normalizing raw HH preferences (train): 69%|██████▉ | 30004/43598 [00:03<00:01, 9564.37 examples/s]
|
||
Normalizing raw HH preferences (test): 100%|██████████| 2339/2339 [00:00<00:00, 10830.79 examples/s]
|
||
2026-04-14 19:26:36 - WARNING - alignment.data - Dropped 15 non-canonical HH preference examples from split `test` before normalization (9 x HH preprocessing expects exactly one final assistant response in chosen/rejected suffixes., 6 x HH chosen/rejected transcripts must each contain a divergent assistant response.).
|
||
|
||
Normalizing raw HH preferences (test): 0%| | 0/2339 [00:00<?, ? examples/s]2026-04-14 19:26:36 - WARNING - alignment.data - Dropped 15 non-canonical HH preference examples from split `test` before normalization (9 x HH preprocessing expects exactly one final assistant response in chosen/rejected suffixes., 6 x HH chosen/rejected transcripts must each contain a divergent assistant response.).
|
||
|
||
Normalizing raw HH preferences (train): 74%|███████▍ | 32209/43598 [00:03<00:01, 11269.14 examples/s]
|
||
Normalizing raw HH preferences (test): 0%| | 0/2339 [00:00<?, ? examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-fa6f4b7acba8a3e1.arrow
|
||
2026-04-14 19:26:36 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-fa6f4b7acba8a3e1.arrow
|
||
|
||
Normalizing raw HH preferences (test): 100%|██████████| 2339/2339 [00:00<00:00, 10748.53 examples/s]
|
||
|
||
Normalizing raw HH preferences (test): 84%|████████▍ | 1971/2339 [00:00<00:00, 10107.33 examples/s]
|
||
Normalizing raw HH preferences (test): 39%|███▊ | 903/2339 [00:00<00:00, 8986.60 examples/s]
|
||
Normalizing raw HH preferences (train): 71%|███████ | 31000/43598 [00:03<00:01, 9546.44 examples/s]
|
||
Normalizing raw HH preferences (test): 100%|██████████| 2339/2339 [00:00<00:00, 8982.32 examples/s]
|
||
|
||
Normalizing raw HH preferences (test): 50%|████▉ | 1162/2339 [00:00<00:00, 11577.39 examples/s]
|
||
Normalizing raw HH preferences (train): 77%|███████▋ | 33420/43598 [00:03<00:00, 11474.34 examples/s]
|
||
Normalizing raw HH preferences (test): 35%|███▍ | 814/2339 [00:00<00:00, 8091.93 examples/s]
|
||
Normalizing raw HH preferences (test): 86%|████████▌ | 2000/2339 [00:00<00:00, 10145.46 examples/s]
|
||
Normalizing raw HH preferences (test): 100%|██████████| 2339/2339 [00:00<00:00, 9238.93 examples/s]
|
||
|
||
Normalizing raw HH preferences (test): 100%|██████████| 2339/2339 [00:00<00:00, 11486.50 examples/s]
|
||
Normalizing raw HH preferences (train): 80%|███████▉ | 34705/43598 [00:03<00:00, 11828.62 examples/s]
|
||
Normalizing raw HH preferences (test): 100%|██████████| 2339/2339 [00:00<00:00, 10383.56 examples/s]
|
||
|
||
Normalizing raw HH preferences (train): 74%|███████▍ | 32220/43598 [00:03<00:01, 9043.46 examples/s]
|
||
Normalizing raw HH preferences (test): 84%|████████▍ | 1965/2339 [00:00<00:00, 7793.46 examples/s]
|
||
Normalizing raw HH preferences (train): 82%|████████▏ | 35957/43598 [00:04<00:00, 12016.06 examples/s]
|
||
Normalizing raw HH preferences (test): 100%|██████████| 2339/2339 [00:00<00:00, 6863.93 examples/s]
|
||
Loading cached shuffled indices for dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-be0876dd0add1b31.arrow
|
||
2026-04-14 19:26:37 - INFO - datasets.arrow_dataset - Loading cached shuffled indices for dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-be0876dd0add1b31.arrow
|
||
Loading cached shuffled indices for dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-40e942b49dfd026a.arrow
|
||
2026-04-14 19:26:37 - INFO - datasets.arrow_dataset - Loading cached shuffled indices for dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-40e942b49dfd026a.arrow
|
||
2026-04-14 19:26:37 - INFO - __main__ - Training on the following datasets and their proportions: ['train : 43598', 'test : 2339']
|
||
|
||
Normalizing raw HH preferences (train): 77%|███████▋ | 33667/43598 [00:04<00:01, 9242.25 examples/s]
|
||
Normalizing raw HH preferences (train): 87%|████████▋ | 37798/43598 [00:04<00:00, 12109.00 examples/s]
|
||
Normalizing raw HH preferences (train): 80%|███████▉ | 34857/43598 [00:04<00:00, 9883.82 examples/s]
|
||
Normalizing raw HH preferences (train): 83%|████████▎ | 36000/43598 [00:04<00:00, 10128.02 examples/s]
|
||
Normalizing raw HH preferences (train): 91%|█████████ | 39686/43598 [00:04<00:00, 12183.67 examples/s]
|
||
Normalizing raw HH preferences (train): 86%|████████▌ | 37284/43598 [00:04<00:00, 10847.13 examples/s]
|
||
Normalizing raw HH preferences (train): 94%|█████████▍| 40955/43598 [00:04<00:00, 12304.64 examples/s]
|
||
Normalizing raw HH preferences (train): 88%|████████▊ | 38554/43598 [00:04<00:00, 11354.11 examples/s]
|
||
Normalizing raw HH preferences (train): 98%|█████████▊| 42817/43598 [00:04<00:00, 12339.55 examples/s]
|
||
Normalizing raw HH preferences (train): 91%|█████████▏| 39832/43598 [00:04<00:00, 11753.52 examples/s]
|
||
Normalizing raw HH preferences (train): 96%|█████████▌| 41713/43598 [00:04<00:00, 12046.72 examples/s]
|
||
Normalizing raw HH preferences (train): 100%|██████████| 43598/43598 [00:04<00:00, 9085.79 examples/s]
|
||
|
||
Normalizing raw HH preferences (train): 99%|█████████▊| 42980/43598 [00:04<00:00, 12210.48 examples/s]
|
||
Normalizing raw HH preferences (train): 100%|██████████| 43598/43598 [00:04<00:00, 8794.53 examples/s]
|
||
[INFO|tokenization_utils_base.py:2060] 2026-04-14 19:26:38,916 >> loading file vocab.json from cache at /scratch/qu.yang1/hf/hub/models--Qwen--Qwen3-8B-Base/snapshots/49e3418fbbbca6ecbdf9608b4d22e5a407081db4/vocab.json
|
||
[INFO|tokenization_utils_base.py:2060] 2026-04-14 19:26:38,916 >> loading file merges.txt from cache at /scratch/qu.yang1/hf/hub/models--Qwen--Qwen3-8B-Base/snapshots/49e3418fbbbca6ecbdf9608b4d22e5a407081db4/merges.txt
|
||
[INFO|tokenization_utils_base.py:2060] 2026-04-14 19:26:38,916 >> loading file tokenizer.json from cache at /scratch/qu.yang1/hf/hub/models--Qwen--Qwen3-8B-Base/snapshots/49e3418fbbbca6ecbdf9608b4d22e5a407081db4/tokenizer.json
|
||
[INFO|tokenization_utils_base.py:2060] 2026-04-14 19:26:38,916 >> loading file added_tokens.json from cache at None
|
||
[INFO|tokenization_utils_base.py:2060] 2026-04-14 19:26:38,916 >> loading file special_tokens_map.json from cache at None
|
||
[INFO|tokenization_utils_base.py:2060] 2026-04-14 19:26:38,916 >> loading file tokenizer_config.json from cache at /scratch/qu.yang1/hf/hub/models--Qwen--Qwen3-8B-Base/snapshots/49e3418fbbbca6ecbdf9608b4d22e5a407081db4/tokenizer_config.json
|
||
[INFO|tokenization_utils_base.py:2060] 2026-04-14 19:26:38,916 >> loading file chat_template.jinja from cache at None
|
||
[INFO|tokenization_utils_base.py:2323] 2026-04-14 19:26:39,206 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
|
||
2026-04-14 19:26:39 - INFO - __main__ - *** Load pretrained model ***
|
||
|
||
Applying chat template (num_proc=12): 0%| | 0/43598 [00:00<?, ? examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 0/43598 [00:00<?, ? examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 0/43598 [00:00<?, ? examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 0/43598 [00:00<?, ? examples/s]2026-04-14 19:26:39 - WARNING - alignment.data - Dropped 15 non-canonical HH preference examples from split `test` before normalization (9 x HH preprocessing expects exactly one final assistant response in chosen/rejected suffixes., 6 x HH chosen/rejected transcripts must each contain a divergent assistant response.).
|
||
|
||
Normalizing raw HH preferences (test): 0%| | 0/2339 [00:00<?, ? examples/s]Process #0 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00000_of_00012.arrow
|
||
2026-04-14 19:26:39 - INFO - datasets.arrow_dataset - Process #0 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00000_of_00012.arrow
|
||
Process #1 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00001_of_00012.arrow
|
||
2026-04-14 19:26:39 - INFO - datasets.arrow_dataset - Process #1 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00001_of_00012.arrow
|
||
Process #2 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00002_of_00012.arrow
|
||
2026-04-14 19:26:39 - INFO - datasets.arrow_dataset - Process #2 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00002_of_00012.arrow
|
||
Process #3 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00003_of_00012.arrow
|
||
2026-04-14 19:26:39 - INFO - datasets.arrow_dataset - Process #3 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00003_of_00012.arrow
|
||
Process #4 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00004_of_00012.arrow
|
||
2026-04-14 19:26:39 - INFO - datasets.arrow_dataset - Process #4 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00004_of_00012.arrow
|
||
Process #5 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00005_of_00012.arrow
|
||
2026-04-14 19:26:39 - INFO - datasets.arrow_dataset - Process #5 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00005_of_00012.arrow
|
||
Process #6 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00006_of_00012.arrow
|
||
2026-04-14 19:26:39 - INFO - datasets.arrow_dataset - Process #6 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00006_of_00012.arrow
|
||
Process #7 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00007_of_00012.arrow
|
||
2026-04-14 19:26:39 - INFO - datasets.arrow_dataset - Process #7 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00007_of_00012.arrow
|
||
Process #8 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00008_of_00012.arrow
|
||
2026-04-14 19:26:39 - INFO - datasets.arrow_dataset - Process #8 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00008_of_00012.arrow
|
||
Process #9 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00009_of_00012.arrow
|
||
2026-04-14 19:26:39 - INFO - datasets.arrow_dataset - Process #9 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00009_of_00012.arrow
|
||
Process #10 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00010_of_00012.arrow
|
||
2026-04-14 19:26:39 - INFO - datasets.arrow_dataset - Process #10 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00010_of_00012.arrow
|
||
Process #11 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00011_of_00012.arrow
|
||
2026-04-14 19:26:39 - INFO - datasets.arrow_dataset - Process #11 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00011_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 0%| | 0/43598 [00:00<?, ? examples/s]Spawning 12 processes
|
||
2026-04-14 19:26:39 - INFO - datasets.arrow_dataset - Spawning 12 processes
|
||
|
||
Applying chat template (num_proc=12): 0%| | 0/43598 [00:00<?, ? examples/s]2026-04-14 19:26:39 - WARNING - alignment.data - Dropped 15 non-canonical HH preference examples from split `test` before normalization (9 x HH preprocessing expects exactly one final assistant response in chosen/rejected suffixes., 6 x HH chosen/rejected transcripts must each contain a divergent assistant response.).
|
||
|
||
Normalizing raw HH preferences (test): 0%| | 0/2339 [00:00<?, ? examples/s]
|
||
Normalizing raw HH preferences (test): 43%|████▎ | 1000/2339 [00:00<00:00, 8997.51 examples/s]
|
||
Normalizing raw HH preferences (test): 37%|███▋ | 877/2339 [00:00<00:00, 8730.48 examples/s]
|
||
Normalizing raw HH preferences (test): 86%|████████▌ | 2000/2339 [00:00<00:00, 9183.37 examples/s]
|
||
Normalizing raw HH preferences (test): 100%|██████████| 2339/2339 [00:00<00:00, 8070.83 examples/s]
|
||
|
||
Normalizing raw HH preferences (test): 85%|████████▌ | 1992/2339 [00:00<00:00, 7831.84 examples/s]
|
||
Normalizing raw HH preferences (test): 100%|██████████| 2339/2339 [00:00<00:00, 6568.95 examples/s]
|
||
|
||
Applying chat template (num_proc=12): 0%| | 1/43598 [00:00<10:51:32, 1.12 examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 1/43598 [00:00<11:29:11, 1.05 examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 1/43598 [00:00<12:03:02, 1.00 examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 1/43598 [00:01<13:27:31, 1.11s/ examples]
|
||
Applying chat template (num_proc=12): 1%| | 291/43598 [00:01<02:13, 323.24 examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 1/43598 [00:01<13:32:08, 1.12s/ examples]
|
||
Applying chat template (num_proc=12): 1%| | 225/43598 [00:01<03:03, 236.55 examples/s]
|
||
Applying chat template (num_proc=12): 1%| | 434/43598 [00:01<01:46, 405.05 examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 70/43598 [00:01<11:59, 60.47 examples/s]
|
||
Applying chat template (num_proc=12): 2%|▏ | 788/43598 [00:01<00:56, 760.26 examples/s]
|
||
Applying chat template (num_proc=12): 2%|▏ | 765/43598 [00:01<00:59, 715.70 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00002_of_00012.arrow
|
||
2026-04-14 19:26:40 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00002_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 1%| | 227/43598 [00:01<03:46, 191.52 examples/s]
|
||
Applying chat template (num_proc=12): 3%|▎ | 1369/43598 [00:01<00:37, 1137.02 examples/s]
|
||
Applying chat template (num_proc=12): 3%|▎ | 1501/43598 [00:01<00:30, 1379.74 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00000_of_00012.arrow
|
||
2026-04-14 19:26:40 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00000_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 0%| | 1/43598 [00:01<20:03:49, 1.66s/ examples]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00001_of_00012.arrow
|
||
2026-04-14 19:26:41 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00001_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 3%|▎ | 1416/43598 [00:01<00:37, 1115.05 examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 0/43598 [00:00<?, ? examples/s]
|
||
Applying chat template (num_proc=12): 2%|▏ | 945/43598 [00:01<00:58, 724.23 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00003_of_00012.arrow
|
||
2026-04-14 19:26:41 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00003_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 2%|▏ | 655/43598 [00:01<01:35, 449.14 examples/s]
|
||
Applying chat template (num_proc=12): 5%|▍ | 2023/43598 [00:02<00:31, 1321.79 examples/s]
|
||
Applying chat template (num_proc=12): 4%|▍ | 1925/43598 [00:02<00:36, 1152.55 examples/s]
|
||
Applying chat template (num_proc=12): 5%|▌ | 2280/43598 [00:02<00:26, 1589.03 examples/s]
|
||
Applying chat template (num_proc=12): 1%| | 231/43598 [00:02<05:12, 138.73 examples/s]
|
||
Applying chat template (num_proc=12): 5%|▍ | 2172/43598 [00:02<00:28, 1436.32 examples/s]
|
||
Applying chat template (num_proc=12): 8%|▊ | 3319/43598 [00:02<00:19, 2046.46 examples/s]
|
||
Applying chat template (num_proc=12): 5%|▍ | 2018/43598 [00:02<00:32, 1273.53 examples/s]
|
||
Applying chat template (num_proc=12): 8%|▊ | 3329/43598 [00:02<00:21, 1895.11 examples/s]
|
||
Applying chat template (num_proc=12): 7%|▋ | 3021/43598 [00:02<00:26, 1554.64 examples/s]
|
||
Applying chat template (num_proc=12): 3%|▎ | 1490/43598 [00:02<00:48, 862.53 examples/s]
|
||
Applying chat template (num_proc=12): 8%|▊ | 3441/43598 [00:02<00:21, 1839.74 examples/s]
|
||
Applying chat template (num_proc=12): 10%|▉ | 4301/43598 [00:02<00:17, 2241.16 examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 0/43598 [00:00<?, ? examples/s]
|
||
Applying chat template (num_proc=12): 13%|█▎ | 5693/43598 [00:02<00:10, 3557.09 examples/s]
|
||
Applying chat template (num_proc=12): 9%|▉ | 3831/43598 [00:02<00:18, 2127.06 examples/s]
|
||
Applying chat template (num_proc=12): 11%|█ | 4721/43598 [00:03<00:17, 2269.78 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00004_of_00012.arrow
|
||
2026-04-14 19:26:42 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00004_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 14%|█▍ | 6306/43598 [00:03<00:10, 3593.05 examples/s]
|
||
Applying chat template (num_proc=12): 10%|█ | 4565/43598 [00:03<00:18, 2070.40 examples/s]
|
||
Applying chat template (num_proc=12): 15%|█▌ | 6713/43598 [00:03<00:09, 3976.51 examples/s]
|
||
Applying chat template (num_proc=12): 16%|█▌ | 6872/43598 [00:03<00:09, 3765.62 examples/s]
|
||
Applying chat template (num_proc=12): 6%|▌ | 2648/43598 [00:03<00:32, 1278.56 examples/s]
|
||
Applying chat template (num_proc=12): 17%|█▋ | 7490/43598 [00:03<00:08, 4092.92 examples/s]
|
||
Applying chat template (num_proc=12): 17%|█▋ | 7391/43598 [00:03<00:09, 3862.06 examples/s]
|
||
Applying chat template (num_proc=12): 11%|█▏ | 4984/43598 [00:03<00:18, 2126.17 examples/s]
|
||
Applying chat template (num_proc=12): 13%|█▎ | 5667/43598 [00:03<00:14, 2629.42 examples/s]
|
||
Applying chat template (num_proc=12): 18%|█▊ | 7967/43598 [00:03<00:08, 4180.49 examples/s]
|
||
Applying chat template (num_proc=12): 19%|█▊ | 8159/43598 [00:03<00:08, 4082.48 examples/s]
|
||
Applying chat template (num_proc=12): 14%|█▍ | 6319/43598 [00:03<00:13, 2680.52 examples/s]
|
||
Applying chat template (num_proc=12): 19%|█▉ | 8479/43598 [00:03<00:08, 4331.22 examples/s]
|
||
Applying chat template (num_proc=12): 17%|█▋ | 7599/43598 [00:03<00:09, 3991.90 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00005_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 20%|██ | 8767/43598 [00:03<00:08, 4063.75 examples/s]
|
||
Applying chat template (num_proc=12): 19%|█▊ | 8103/43598 [00:03<00:08, 4035.21 examples/s]2026-04-14 19:26:42 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00005_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 21%|██ | 9033/43598 [00:03<00:07, 4566.32 examples/s]
|
||
Applying chat template (num_proc=12): 21%|██▏ | 9301/43598 [00:03<00:08, 4089.65 examples/s]
|
||
Applying chat template (num_proc=12): 19%|█▉ | 8326/43598 [00:03<00:08, 3960.57 examples/s]
|
||
Applying chat template (num_proc=12): 22%|██▏ | 9557/43598 [00:03<00:07, 4709.07 examples/s]
|
||
Applying chat template (num_proc=12): 9%|▉ | 3970/43598 [00:03<00:24, 1605.55 examples/s]
|
||
Applying chat template (num_proc=12): 20%|██ | 8871/43598 [00:03<00:08, 4008.46 examples/s]
|
||
Applying chat template (num_proc=12): 23%|██▎ | 9814/43598 [00:03<00:07, 4233.18 examples/s]
|
||
Applying chat template (num_proc=12): 23%|██▎ | 10101/43598 [00:03<00:07, 4731.94 examples/s]
|
||
Applying chat template (num_proc=12): 21%|██ | 8960/43598 [00:03<00:08, 3963.25 examples/s]
|
||
Applying chat template (num_proc=12): 16%|█▌ | 6789/43598 [00:04<00:14, 2490.31 examples/s]
|
||
Applying chat template (num_proc=12): 24%|██▎ | 10312/43598 [00:04<00:07, 4274.58 examples/s]
|
||
Applying chat template (num_proc=12): 22%|██▏ | 9529/43598 [00:04<00:08, 4060.86 examples/s]
|
||
Applying chat template (num_proc=12): 24%|██▍ | 10601/43598 [00:04<00:07, 4663.81 examples/s]
|
||
Applying chat template (num_proc=12): 22%|██▏ | 9383/43598 [00:04<00:07, 4284.64 examples/s]
|
||
Applying chat template (num_proc=12): 22%|██▏ | 9524/43598 [00:04<00:08, 3977.33 examples/s]
|
||
Applying chat template (num_proc=12): 25%|██▍ | 10868/43598 [00:04<00:07, 4544.29 examples/s]
|
||
Applying chat template (num_proc=12): 25%|██▌ | 11094/43598 [00:04<00:06, 4708.04 examples/s]
|
||
Applying chat template (num_proc=12): 23%|██▎ | 10127/43598 [00:04<00:08, 4030.19 examples/s]
|
||
Applying chat template (num_proc=12): 26%|██▌ | 11390/43598 [00:04<00:06, 4603.08 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00006_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 23%|██▎ | 10038/43598 [00:04<00:08, 4065.21 examples/s]2026-04-14 19:26:43 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00006_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 27%|██▋ | 11625/43598 [00:04<00:07, 4421.20 examples/s]
|
||
Applying chat template (num_proc=12): 24%|██▍ | 10662/43598 [00:04<00:08, 4058.51 examples/s]
|
||
Applying chat template (num_proc=12): 27%|██▋ | 11894/43598 [00:04<00:06, 4591.97 examples/s]
|
||
Applying chat template (num_proc=12): 24%|██▍ | 10546/43598 [00:04<00:08, 4042.58 examples/s]
|
||
Applying chat template (num_proc=12): 24%|██▎ | 10268/43598 [00:04<00:08, 4113.65 examples/s]
|
||
Applying chat template (num_proc=12): 28%|██▊ | 12168/43598 [00:04<00:06, 4651.11 examples/s]
|
||
Applying chat template (num_proc=12): 26%|██▌ | 11229/43598 [00:04<00:07, 4330.03 examples/s]
|
||
Applying chat template (num_proc=12): 13%|█▎ | 5513/43598 [00:04<00:19, 1962.54 examples/s]
|
||
Applying chat template (num_proc=12): 28%|██▊ | 12405/43598 [00:04<00:06, 4604.01 examples/s]
|
||
Applying chat template (num_proc=12): 25%|██▌ | 11047/43598 [00:04<00:07, 4123.03 examples/s]
|
||
Applying chat template (num_proc=12): 29%|██▉ | 12708/43598 [00:04<00:06, 4727.85 examples/s]
|
||
Applying chat template (num_proc=12): 27%|██▋ | 11743/43598 [00:04<00:07, 4440.79 examples/s]
|
||
Applying chat template (num_proc=12): 18%|█▊ | 7848/43598 [00:04<00:10, 3494.86 examples/s]
|
||
Applying chat template (num_proc=12): 30%|██▉ | 12906/43598 [00:04<00:06, 4705.35 examples/s]
|
||
Applying chat template (num_proc=12): 26%|██▋ | 11511/43598 [00:04<00:07, 4234.01 examples/s]
|
||
Applying chat template (num_proc=12): 25%|██▌ | 11015/43598 [00:04<00:08, 3990.41 examples/s]
|
||
Applying chat template (num_proc=12): 31%|███ | 13350/43598 [00:04<00:05, 5162.47 examples/s]
|
||
Applying chat template (num_proc=12): 28%|██▊ | 12243/43598 [00:04<00:06, 4543.80 examples/s]
|
||
Applying chat template (num_proc=12): 31%|███ | 13443/43598 [00:04<00:06, 4861.63 examples/s]
|
||
Applying chat template (num_proc=12): 27%|██▋ | 11977/43598 [00:04<00:07, 4146.27 examples/s]
|
||
Applying chat template (num_proc=12): 27%|██▋ | 11620/43598 [00:04<00:07, 4091.15 examples/s]
|
||
Applying chat template (num_proc=12): 32%|███▏ | 13975/43598 [00:04<00:05, 5286.46 examples/s]
|
||
Applying chat template (num_proc=12): 20%|█▉ | 8660/43598 [00:04<00:09, 3616.10 examples/s]
|
||
Applying chat template (num_proc=12): 29%|██▉ | 12773/43598 [00:04<00:06, 4511.99 examples/s]
|
||
Applying chat template (num_proc=12): 32%|███▏ | 14124/43598 [00:04<00:05, 5334.68 examples/s]
|
||
Applying chat template (num_proc=12): 29%|██▊ | 12483/43598 [00:04<00:07, 4177.10 examples/s]
|
||
Applying chat template (num_proc=12): 34%|███▎ | 14683/43598 [00:04<00:05, 5551.21 examples/s]
|
||
Applying chat template (num_proc=12): 28%|██▊ | 12201/43598 [00:04<00:07, 4176.69 examples/s]
|
||
Applying chat template (num_proc=12): 31%|███ | 13305/43598 [00:04<00:06, 4595.90 examples/s]
|
||
Applying chat template (num_proc=12): 34%|███▎ | 14670/43598 [00:04<00:05, 5170.98 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00007_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 30%|██▉ | 12935/43598 [00:04<00:07, 4068.44 examples/s]
|
||
Applying chat template (num_proc=12): 35%|███▌ | 15278/43598 [00:04<00:05, 5541.70 examples/s]2026-04-14 19:26:44 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00007_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 21%|██▏ | 9347/43598 [00:04<00:09, 3579.12 examples/s]
|
||
Applying chat template (num_proc=12): 32%|███▏ | 13816/43598 [00:04<00:06, 4692.86 examples/s]
|
||
Applying chat template (num_proc=12): 29%|██▉ | 12764/43598 [00:04<00:07, 4191.74 examples/s]
|
||
Applying chat template (num_proc=12): 35%|███▌ | 15397/43598 [00:04<00:04, 5730.02 examples/s]
|
||
Applying chat template (num_proc=12): 31%|███ | 13367/43598 [00:04<00:07, 4113.65 examples/s]
|
||
Applying chat template (num_proc=12): 36%|███▋ | 15881/43598 [00:05<00:04, 5642.23 examples/s]
|
||
Applying chat template (num_proc=12): 33%|███▎ | 14336/43598 [00:05<00:06, 4674.25 examples/s]
|
||
Applying chat template (num_proc=12): 37%|███▋ | 16000/43598 [00:05<00:04, 5705.08 examples/s]
|
||
Applying chat template (num_proc=12): 30%|███ | 13274/43598 [00:05<00:07, 4252.84 examples/s]
|
||
Applying chat template (num_proc=12): 23%|██▎ | 9941/43598 [00:04<00:09, 3612.65 examples/s]
|
||
Applying chat template (num_proc=12): 32%|███▏ | 13796/43598 [00:05<00:07, 4074.33 examples/s]
|
||
Applying chat template (num_proc=12): 38%|███▊ | 16557/43598 [00:05<00:04, 5849.33 examples/s]
|
||
Applying chat template (num_proc=12): 34%|███▍ | 14863/43598 [00:05<00:06, 4788.24 examples/s]
|
||
Applying chat template (num_proc=12): 38%|███▊ | 16588/43598 [00:05<00:04, 5702.17 examples/s]
|
||
Applying chat template (num_proc=12): 32%|███▏ | 13760/43598 [00:05<00:07, 4169.10 examples/s]
|
||
Applying chat template (num_proc=12): 24%|██▍ | 10475/43598 [00:05<00:09, 3646.91 examples/s]
|
||
Applying chat template (num_proc=12): 40%|███▉ | 17293/43598 [00:05<00:04, 6263.44 examples/s]
|
||
Applying chat template (num_proc=12): 33%|███▎ | 14305/43598 [00:05<00:06, 4188.04 examples/s]
|
||
Applying chat template (num_proc=12): 39%|███▉ | 17210/43598 [00:05<00:04, 5740.70 examples/s]
|
||
Applying chat template (num_proc=12): 35%|███▌ | 15359/43598 [00:05<00:05, 4730.71 examples/s]
|
||
Applying chat template (num_proc=12): 33%|███▎ | 14234/43598 [00:05<00:07, 4163.75 examples/s]
|
||
Applying chat template (num_proc=12): 41%|████ | 17969/43598 [00:05<00:04, 6310.39 examples/s]
|
||
Applying chat template (num_proc=12): 34%|███▍ | 14828/43598 [00:05<00:06, 4461.39 examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 1/43598 [00:03<42:38:13, 3.52s/ examples]
|
||
Applying chat template (num_proc=12): 41%|████ | 17834/43598 [00:05<00:04, 5872.86 examples/s]
|
||
Applying chat template (num_proc=12): 25%|██▌ | 10965/43598 [00:05<00:09, 3528.40 examples/s]
|
||
Applying chat template (num_proc=12): 36%|███▋ | 15897/43598 [00:05<00:05, 4807.79 examples/s]
|
||
Applying chat template (num_proc=12): 34%|███▎ | 14687/43598 [00:05<00:06, 4203.50 examples/s]
|
||
Applying chat template (num_proc=12): 43%|████▎ | 18651/43598 [00:05<00:03, 6413.35 examples/s]
|
||
Applying chat template (num_proc=12): 35%|███▌ | 15294/43598 [00:05<00:06, 4505.06 examples/s]
|
||
Applying chat template (num_proc=12): 26%|██▌ | 11399/43598 [00:05<00:08, 3623.47 examples/s]
|
||
Applying chat template (num_proc=12): 42%|████▏ | 18450/43598 [00:05<00:04, 5787.85 examples/s]
|
||
Applying chat template (num_proc=12): 38%|███▊ | 16420/43598 [00:05<00:05, 4800.78 examples/s]
|
||
Applying chat template (num_proc=12): 35%|███▍ | 15194/43598 [00:05<00:06, 4413.05 examples/s]
|
||
Applying chat template (num_proc=12): 44%|████▍ | 19373/43598 [00:05<00:03, 6616.12 examples/s]
|
||
Applying chat template (num_proc=12): 36%|███▌ | 15750/43598 [00:05<00:06, 4504.35 examples/s]
|
||
Applying chat template (num_proc=12): 44%|████▎ | 19042/43598 [00:05<00:04, 5713.63 examples/s]
|
||
Applying chat template (num_proc=12): 39%|███▉ | 16910/43598 [00:05<00:05, 4698.24 examples/s]
|
||
Applying chat template (num_proc=12): 27%|██▋ | 11843/43598 [00:05<00:08, 3646.19 examples/s]
|
||
Applying chat template (num_proc=12): 36%|███▌ | 15763/43598 [00:05<00:06, 4594.22 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00008_of_00012.arrow
|
||
2026-04-14 19:26:44 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00008_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 46%|████▌ | 20049/43598 [00:05<00:03, 6609.61 examples/s]
|
||
Applying chat template (num_proc=12): 37%|███▋ | 16270/43598 [00:05<00:05, 4636.85 examples/s]
|
||
Applying chat template (num_proc=12): 45%|████▌ | 19636/43598 [00:05<00:04, 5748.00 examples/s]
|
||
Applying chat template (num_proc=12): 28%|██▊ | 12303/43598 [00:05<00:08, 3817.14 examples/s]
|
||
Applying chat template (num_proc=12): 40%|████ | 17464/43598 [00:05<00:05, 4840.95 examples/s]
|
||
Applying chat template (num_proc=12): 37%|███▋ | 16330/43598 [00:05<00:05, 4841.07 examples/s]
|
||
Applying chat template (num_proc=12): 48%|████▊ | 20721/43598 [00:05<00:03, 6525.61 examples/s]
|
||
Applying chat template (num_proc=12): 38%|███▊ | 16758/43598 [00:05<00:05, 4640.50 examples/s]
|
||
Applying chat template (num_proc=12): 46%|████▋ | 20213/43598 [00:05<00:04, 5738.74 examples/s]
|
||
Applying chat template (num_proc=12): 42%|████▏ | 18095/43598 [00:05<00:04, 5124.61 examples/s]
|
||
Applying chat template (num_proc=12): 30%|██▉ | 12880/43598 [00:05<00:07, 4133.45 examples/s]
|
||
Applying chat template (num_proc=12): 39%|███▊ | 16836/43598 [00:05<00:05, 4769.01 examples/s]
|
||
Applying chat template (num_proc=12): 49%|████▉ | 21398/43598 [00:05<00:03, 6353.00 examples/s]
|
||
Applying chat template (num_proc=12): 40%|███▉ | 17231/43598 [00:05<00:05, 4462.07 examples/s]
|
||
Applying chat template (num_proc=12): 48%|████▊ | 20817/43598 [00:05<00:04, 5616.49 examples/s]
|
||
Applying chat template (num_proc=12): 43%|████▎ | 18821/43598 [00:05<00:04, 5582.16 examples/s]
|
||
Applying chat template (num_proc=12): 31%|███ | 13405/43598 [00:05<00:06, 4348.45 examples/s]
|
||
Applying chat template (num_proc=12): 40%|███▉ | 17412/43598 [00:05<00:05, 4898.87 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00009_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 41%|████ | 17721/43598 [00:05<00:05, 4560.29 examples/s]
|
||
Applying chat template (num_proc=12): 51%|█████ | 22038/43598 [00:05<00:03, 6130.09 examples/s]2026-04-14 19:26:45 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00009_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 49%|████▉ | 21403/43598 [00:06<00:04, 5447.71 examples/s]
|
||
Applying chat template (num_proc=12): 45%|████▍ | 19410/43598 [00:06<00:04, 5570.93 examples/s]
|
||
Applying chat template (num_proc=12): 32%|███▏ | 13875/43598 [00:05<00:06, 4320.58 examples/s]
|
||
Applying chat template (num_proc=12): 41%|████ | 17957/43598 [00:06<00:05, 5006.36 examples/s]
|
||
Applying chat template (num_proc=12): 52%|█████▏ | 22665/43598 [00:06<00:03, 6105.59 examples/s]
|
||
Applying chat template (num_proc=12): 42%|████▏ | 18197/43598 [00:06<00:05, 4439.85 examples/s]
|
||
Applying chat template (num_proc=12): 46%|████▌ | 19980/43598 [00:06<00:04, 5563.41 examples/s]
|
||
Applying chat template (num_proc=12): 33%|███▎ | 14475/43598 [00:06<00:06, 4759.70 examples/s]
|
||
Applying chat template (num_proc=12): 50%|█████ | 21965/43598 [00:06<00:04, 5201.93 examples/s]
|
||
Applying chat template (num_proc=12): 42%|████▏ | 18511/43598 [00:06<00:04, 5115.80 examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 69/43598 [00:04<34:25, 21.08 examples/s]
|
||
Applying chat template (num_proc=12): 43%|████▎ | 18755/43598 [00:06<00:05, 4613.89 examples/s]
|
||
Applying chat template (num_proc=12): 53%|█████▎ | 23305/43598 [00:06<00:03, 5815.43 examples/s]
|
||
Applying chat template (num_proc=12): 47%|████▋ | 20548/43598 [00:06<00:04, 5575.84 examples/s]
|
||
Applying chat template (num_proc=12): 34%|███▍ | 15030/43598 [00:06<00:05, 4910.98 examples/s]
|
||
Applying chat template (num_proc=12): 52%|█████▏ | 22509/43598 [00:06<00:04, 5154.35 examples/s]
|
||
Applying chat template (num_proc=12): 44%|████▎ | 19053/43598 [00:06<00:05, 4816.82 examples/s]
|
||
Applying chat template (num_proc=12): 44%|████▍ | 19320/43598 [00:06<00:05, 4823.58 examples/s]
|
||
Applying chat template (num_proc=12): 55%|█████▍ | 23906/43598 [00:06<00:03, 5749.44 examples/s]
|
||
Applying chat template (num_proc=12): 48%|████▊ | 21133/43598 [00:06<00:04, 5530.59 examples/s]
|
||
Applying chat template (num_proc=12): 36%|███▌ | 15539/43598 [00:06<00:05, 4871.12 examples/s]
|
||
Applying chat template (num_proc=12): 53%|█████▎ | 23037/43598 [00:06<00:04, 4997.57 examples/s]
|
||
Applying chat template (num_proc=12): 45%|████▍ | 19541/43598 [00:06<00:05, 4593.12 examples/s]
|
||
Applying chat template (num_proc=12): 56%|█████▋ | 24528/43598 [00:06<00:03, 5836.86 examples/s]
|
||
Applying chat template (num_proc=12): 46%|████▌ | 19975/43598 [00:06<00:04, 5184.19 examples/s]
|
||
Applying chat template (num_proc=12): 50%|████▉ | 21691/43598 [00:06<00:03, 5484.02 examples/s]
|
||
Applying chat template (num_proc=12): 37%|███▋ | 16187/43598 [00:06<00:05, 5171.95 examples/s]
|
||
Applying chat template (num_proc=12): 54%|█████▍ | 23557/43598 [00:06<00:04, 4956.05 examples/s]
|
||
Applying chat template (num_proc=12): 46%|████▌ | 20114/43598 [00:06<00:04, 4840.56 examples/s]
|
||
Applying chat template (num_proc=12): 47%|████▋ | 20597/43598 [00:06<00:04, 5345.14 examples/s]
|
||
Applying chat template (num_proc=12): 58%|█████▊ | 25179/43598 [00:06<00:03, 5777.51 examples/s]
|
||
Applying chat template (num_proc=12): 39%|███▉ | 16933/43598 [00:06<00:04, 5792.42 examples/s]
|
||
Applying chat template (num_proc=12): 51%|█████▏ | 22357/43598 [00:06<00:03, 5618.96 examples/s]
|
||
Applying chat template (num_proc=12): 55%|█████▌ | 24061/43598 [00:06<00:04, 4856.51 examples/s]
|
||
Applying chat template (num_proc=12): 47%|████▋ | 20620/43598 [00:06<00:04, 4877.64 examples/s]
|
||
Applying chat template (num_proc=12): 49%|████▊ | 21229/43598 [00:06<00:04, 5546.62 examples/s]
|
||
Applying chat template (num_proc=12): 59%|█████▉ | 25781/43598 [00:06<00:03, 5716.59 examples/s]
|
||
Applying chat template (num_proc=12): 40%|████ | 17534/43598 [00:06<00:04, 5842.32 examples/s]
|
||
Applying chat template (num_proc=12): 53%|█████▎ | 22956/43598 [00:06<00:03, 5626.68 examples/s]
|
||
Applying chat template (num_proc=12): 56%|█████▋ | 24630/43598 [00:06<00:03, 4985.00 examples/s]
|
||
Applying chat template (num_proc=12): 48%|████▊ | 21138/43598 [00:06<00:04, 4879.48 examples/s]
|
||
Applying chat template (num_proc=12): 50%|█████ | 21832/43598 [00:06<00:03, 5567.31 examples/s]
|
||
Applying chat template (num_proc=12): 61%|██████ | 26390/43598 [00:06<00:02, 5795.51 examples/s]
|
||
Applying chat template (num_proc=12): 42%|████▏ | 18135/43598 [00:06<00:04, 5848.40 examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 1/43598 [00:03<47:02:09, 3.88s/ examples]
|
||
Applying chat template (num_proc=12): 54%|█████▍ | 23585/43598 [00:06<00:03, 5783.48 examples/s]
|
||
Applying chat template (num_proc=12): 58%|█████▊ | 25212/43598 [00:06<00:03, 5170.46 examples/s]
|
||
Applying chat template (num_proc=12): 50%|████▉ | 21633/43598 [00:06<00:04, 4781.92 examples/s]
|
||
Applying chat template (num_proc=12): 62%|██████▏ | 27046/43598 [00:06<00:02, 5912.59 examples/s]
|
||
Applying chat template (num_proc=12): 51%|█████▏ | 22438/43598 [00:06<00:03, 5520.21 examples/s]
|
||
Applying chat template (num_proc=12): 43%|████▎ | 18727/43598 [00:06<00:04, 5743.21 examples/s]
|
||
Applying chat template (num_proc=12): 55%|█████▌ | 24181/43598 [00:06<00:03, 5445.17 examples/s]
|
||
Applying chat template (num_proc=12): 59%|█████▉ | 25755/43598 [00:06<00:03, 4977.35 examples/s]
|
||
Applying chat template (num_proc=12): 51%|█████ | 22230/43598 [00:06<00:04, 4951.09 examples/s]
|
||
Applying chat template (num_proc=12): 53%|█████▎ | 22996/43598 [00:06<00:03, 5500.52 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00010_of_00012.arrow
|
||
2026-04-14 19:26:46 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00010_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 63%|██████▎ | 27667/43598 [00:06<00:02, 5627.53 examples/s]
|
||
Applying chat template (num_proc=12): 44%|████▍ | 19307/43598 [00:06<00:04, 5649.19 examples/s]
|
||
Applying chat template (num_proc=12): 60%|██████ | 26271/43598 [00:07<00:03, 4916.43 examples/s]
|
||
Applying chat template (num_proc=12): 57%|█████▋ | 24735/43598 [00:07<00:03, 5113.03 examples/s]
|
||
Applying chat template (num_proc=12): 52%|█████▏ | 22787/43598 [00:07<00:04, 5034.46 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00011_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 54%|█████▍ | 23557/43598 [00:06<00:03, 5525.97 examples/s]
|
||
Applying chat template (num_proc=12): 1%| | 522/43598 [00:05<04:33, 157.28 examples/s]2026-04-14 19:26:46 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-78e6505eb8c7606f_00011_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 65%|██████▍ | 28253/43598 [00:07<00:02, 5631.52 examples/s]
|
||
Applying chat template (num_proc=12): 46%|████▌ | 19903/43598 [00:06<00:04, 5622.72 examples/s]
|
||
Applying chat template (num_proc=12): 62%|██████▏ | 26849/43598 [00:07<00:03, 5157.99 examples/s]
|
||
Applying chat template (num_proc=12): 58%|█████▊ | 25354/43598 [00:07<00:03, 5399.23 examples/s]
|
||
Applying chat template (num_proc=12): 54%|█████▎ | 23403/43598 [00:07<00:03, 5115.50 examples/s]
|
||
Applying chat template (num_proc=12): 55%|█████▌ | 24114/43598 [00:07<00:03, 5391.14 examples/s]
|
||
Applying chat template (num_proc=12): 66%|██████▌ | 28842/43598 [00:07<00:02, 5513.47 examples/s]
|
||
Applying chat template (num_proc=12): 47%|████▋ | 20469/43598 [00:07<00:04, 5581.31 examples/s]
|
||
Applying chat template (num_proc=12): 63%|██████▎ | 27395/43598 [00:07<00:03, 5085.46 examples/s]
|
||
Applying chat template (num_proc=12): 59%|█████▉ | 25928/43598 [00:07<00:03, 5180.22 examples/s]
|
||
Applying chat template (num_proc=12): 55%|█████▌ | 24002/43598 [00:07<00:03, 5333.66 examples/s]
|
||
Applying chat template (num_proc=12): 57%|█████▋ | 24783/43598 [00:07<00:03, 5665.98 examples/s]
|
||
Applying chat template (num_proc=12): 67%|██████▋ | 29413/43598 [00:07<00:02, 5330.37 examples/s]
|
||
Applying chat template (num_proc=12): 48%|████▊ | 21041/43598 [00:07<00:04, 5446.96 examples/s]
|
||
Applying chat template (num_proc=12): 64%|██████▍ | 27907/43598 [00:07<00:03, 5013.37 examples/s]
|
||
Applying chat template (num_proc=12): 61%|██████ | 26457/43598 [00:07<00:03, 5137.30 examples/s]
|
||
Applying chat template (num_proc=12): 58%|█████▊ | 25380/43598 [00:07<00:03, 5738.47 examples/s]
|
||
Applying chat template (num_proc=12): 56%|█████▋ | 24563/43598 [00:07<00:03, 5312.33 examples/s]
|
||
Applying chat template (num_proc=12): 1%| | 245/43598 [00:04<09:52, 73.18 examples/s]
|
||
Applying chat template (num_proc=12): 50%|████▉ | 21753/43598 [00:07<00:03, 5847.90 examples/s]
|
||
Applying chat template (num_proc=12): 69%|██████▊ | 29958/43598 [00:07<00:02, 5246.95 examples/s]
|
||
Applying chat template (num_proc=12): 65%|██████▌ | 28441/43598 [00:07<00:03, 4939.92 examples/s]
|
||
Applying chat template (num_proc=12): 58%|█████▊ | 25209/43598 [00:07<00:03, 5547.53 examples/s]
|
||
Applying chat template (num_proc=12): 62%|██████▏ | 27000/43598 [00:07<00:03, 4907.03 examples/s]
|
||
Applying chat template (num_proc=12): 60%|█████▉ | 26038/43598 [00:07<00:03, 5778.58 examples/s]
|
||
Applying chat template (num_proc=12): 51%|█████▏ | 22350/43598 [00:07<00:03, 5838.07 examples/s]
|
||
Applying chat template (num_proc=12): 70%|██████▉ | 30496/43598 [00:07<00:02, 5205.02 examples/s]
|
||
Applying chat template (num_proc=12): 66%|██████▋ | 28938/43598 [00:07<00:03, 4822.14 examples/s]
|
||
Applying chat template (num_proc=12): 61%|██████ | 26651/43598 [00:07<00:02, 5877.17 examples/s]
|
||
Applying chat template (num_proc=12): 59%|█████▉ | 25803/43598 [00:07<00:03, 5517.65 examples/s]
|
||
Applying chat template (num_proc=12): 63%|██████▎ | 27559/43598 [00:07<00:03, 4871.41 examples/s]
|
||
Applying chat template (num_proc=12): 71%|███████ | 31026/43598 [00:07<00:02, 5224.66 examples/s]
|
||
Applying chat template (num_proc=12): 53%|█████▎ | 22947/43598 [00:07<00:03, 5764.80 examples/s]
|
||
Applying chat template (num_proc=12): 68%|██████▊ | 29460/43598 [00:07<00:02, 4869.22 examples/s]
|
||
Applying chat template (num_proc=12): 61%|██████ | 26492/43598 [00:07<00:02, 5793.58 examples/s]
|
||
Applying chat template (num_proc=12): 63%|██████▎ | 27255/43598 [00:07<00:02, 5731.61 examples/s]
|
||
Applying chat template (num_proc=12): 64%|██████▍ | 28109/43598 [00:07<00:03, 4830.68 examples/s]
|
||
Applying chat template (num_proc=12): 72%|███████▏ | 31594/43598 [00:07<00:02, 5316.04 examples/s]
|
||
Applying chat template (num_proc=12): 54%|█████▍ | 23619/43598 [00:07<00:03, 5893.26 examples/s]
|
||
Applying chat template (num_proc=12): 69%|██████▊ | 29961/43598 [00:07<00:02, 4889.57 examples/s]
|
||
Applying chat template (num_proc=12): 64%|██████▍ | 27859/43598 [00:07<00:02, 5739.53 examples/s]
|
||
Applying chat template (num_proc=12): 62%|██████▏ | 27102/43598 [00:07<00:02, 5717.53 examples/s]
|
||
Applying chat template (num_proc=12): 74%|███████▎ | 32132/43598 [00:07<00:02, 5182.70 examples/s]
|
||
Applying chat template (num_proc=12): 66%|██████▌ | 28637/43598 [00:07<00:03, 4662.10 examples/s]
|
||
Applying chat template (num_proc=12): 56%|█████▌ | 24245/43598 [00:07<00:03, 5941.61 examples/s]
|
||
Applying chat template (num_proc=12): 70%|██████▉ | 30514/43598 [00:07<00:02, 5018.50 examples/s]
|
||
Applying chat template (num_proc=12): 4%|▎ | 1603/43598 [00:06<01:29, 471.18 examples/s]
|
||
Applying chat template (num_proc=12): 65%|██████▌ | 28445/43598 [00:07<00:02, 5724.29 examples/s]
|
||
Applying chat template (num_proc=12): 64%|██████▎ | 27700/43598 [00:07<00:02, 5720.16 examples/s]
|
||
Applying chat template (num_proc=12): 75%|███████▍ | 32664/43598 [00:07<00:02, 5139.73 examples/s]
|
||
Applying chat template (num_proc=12): 57%|█████▋ | 24850/43598 [00:07<00:03, 5903.80 examples/s]
|
||
Applying chat template (num_proc=12): 67%|██████▋ | 29175/43598 [00:07<00:03, 4631.20 examples/s]
|
||
Applying chat template (num_proc=12): 71%|███████ | 31063/43598 [00:08<00:02, 4900.36 examples/s]
|
||
Applying chat template (num_proc=12): 1%|▏ | 630/43598 [00:05<03:48, 188.12 examples/s]
|
||
Applying chat template (num_proc=12): 67%|██████▋ | 29023/43598 [00:07<00:02, 5500.47 examples/s]
|
||
Applying chat template (num_proc=12): 65%|██████▍ | 28279/43598 [00:08<00:02, 5376.09 examples/s]
|
||
Applying chat template (num_proc=12): 58%|█████▊ | 25445/43598 [00:07<00:03, 5673.42 examples/s]
|
||
Applying chat template (num_proc=12): 76%|███████▌ | 33205/43598 [00:08<00:02, 4905.00 examples/s]
|
||
Applying chat template (num_proc=12): 68%|██████▊ | 29858/43598 [00:08<00:02, 5065.47 examples/s]
|
||
Applying chat template (num_proc=12): 72%|███████▏ | 31574/43598 [00:08<00:02, 4682.64 examples/s]
|
||
Applying chat template (num_proc=12): 68%|██████▊ | 29609/43598 [00:08<00:02, 5526.45 examples/s]
|
||
Applying chat template (num_proc=12): 66%|██████▌ | 28842/43598 [00:08<00:02, 5229.76 examples/s]
|
||
Applying chat template (num_proc=12): 60%|█████▉ | 26124/43598 [00:08<00:02, 5945.35 examples/s]
|
||
Applying chat template (num_proc=12): 77%|███████▋ | 33704/43598 [00:08<00:02, 4909.64 examples/s]
|
||
Applying chat template (num_proc=12): 70%|███████ | 30565/43598 [00:08<00:02, 5402.58 examples/s]
|
||
Applying chat template (num_proc=12): 74%|███████▎ | 32053/43598 [00:08<00:02, 4711.36 examples/s]
|
||
Applying chat template (num_proc=12): 69%|██████▉ | 30163/43598 [00:08<00:02, 5418.77 examples/s]
|
||
Applying chat template (num_proc=12): 61%|██████▏ | 26730/43598 [00:08<00:02, 5939.35 examples/s]
|
||
Applying chat template (num_proc=12): 67%|██████▋ | 29384/43598 [00:08<00:02, 5000.89 examples/s]
|
||
Applying chat template (num_proc=12): 71%|███████▏ | 31160/43598 [00:08<00:02, 5401.59 examples/s]
|
||
Applying chat template (num_proc=12): 79%|███████▊ | 34225/43598 [00:08<00:02, 4484.91 examples/s]
|
||
Applying chat template (num_proc=12): 75%|███████▍ | 32554/43598 [00:08<00:02, 4395.47 examples/s]
|
||
Applying chat template (num_proc=12): 70%|███████ | 30736/43598 [00:08<00:02, 5219.59 examples/s]
|
||
Applying chat template (num_proc=12): 69%|██████▊ | 29889/43598 [00:08<00:02, 4860.29 examples/s]
|
||
Applying chat template (num_proc=12): 63%|██████▎ | 27356/43598 [00:08<00:02, 5673.69 examples/s]
|
||
Applying chat template (num_proc=12): 73%|███████▎ | 31745/43598 [00:08<00:02, 5384.59 examples/s]
|
||
Applying chat template (num_proc=12): 80%|███████▉ | 34708/43598 [00:08<00:02, 4068.14 examples/s]
|
||
Applying chat template (num_proc=12): 76%|███████▌ | 33014/43598 [00:08<00:02, 4258.17 examples/s]
|
||
Applying chat template (num_proc=12): 72%|███████▏ | 31288/43598 [00:08<00:02, 4961.98 examples/s]
|
||
Applying chat template (num_proc=12): 64%|██████▍ | 27933/43598 [00:08<00:02, 5629.72 examples/s]
|
||
Applying chat template (num_proc=12): 70%|██████▉ | 30395/43598 [00:08<00:02, 4759.71 examples/s]
|
||
Applying chat template (num_proc=12): 74%|███████▍ | 32314/43598 [00:08<00:02, 5200.42 examples/s]
|
||
Applying chat template (num_proc=12): 81%|████████ | 35154/43598 [00:08<00:02, 4057.53 examples/s]
|
||
Applying chat template (num_proc=12): 77%|███████▋ | 33469/43598 [00:08<00:02, 4189.19 examples/s]
|
||
Applying chat template (num_proc=12): 73%|███████▎ | 31814/43598 [00:08<00:02, 4945.77 examples/s]
|
||
Applying chat template (num_proc=12): 3%|▎ | 1463/43598 [00:05<01:31, 462.20 examples/s]
|
||
Applying chat template (num_proc=12): 65%|██████▌ | 28524/43598 [00:08<00:02, 5575.66 examples/s]
|
||
Applying chat template (num_proc=12): 71%|███████ | 30874/43598 [00:08<00:02, 4539.73 examples/s]
|
||
Applying chat template (num_proc=12): 75%|███████▌ | 32905/43598 [00:08<00:01, 5372.96 examples/s]
|
||
Applying chat template (num_proc=12): 7%|▋ | 3008/43598 [00:06<00:48, 835.58 examples/s]
|
||
Applying chat template (num_proc=12): 82%|████████▏ | 35570/43598 [00:08<00:01, 4017.90 examples/s]
|
||
Applying chat template (num_proc=12): 78%|███████▊ | 33891/43598 [00:08<00:02, 4174.20 examples/s]
|
||
Applying chat template (num_proc=12): 74%|███████▍ | 32341/43598 [00:08<00:02, 4909.33 examples/s]
|
||
Applying chat template (num_proc=12): 72%|███████▏ | 31478/43598 [00:08<00:02, 4939.60 examples/s]
|
||
Applying chat template (num_proc=12): 67%|██████▋ | 29104/43598 [00:08<00:02, 5226.85 examples/s]
|
||
Applying chat template (num_proc=12): 77%|███████▋ | 33491/43598 [00:08<00:01, 5428.26 examples/s]
|
||
Applying chat template (num_proc=12): 83%|████████▎ | 36019/43598 [00:08<00:01, 4076.71 examples/s]
|
||
Applying chat template (num_proc=12): 79%|███████▉ | 34370/43598 [00:08<00:02, 4310.62 examples/s]
|
||
Applying chat template (num_proc=12): 75%|███████▌ | 32838/43598 [00:08<00:02, 4882.07 examples/s]
|
||
Applying chat template (num_proc=12): 73%|███████▎ | 31982/43598 [00:08<00:02, 4910.13 examples/s]
|
||
Applying chat template (num_proc=12): 78%|███████▊ | 34112/43598 [00:08<00:01, 5642.49 examples/s]
|
||
Applying chat template (num_proc=12): 68%|██████▊ | 29656/43598 [00:08<00:02, 5068.98 examples/s]
|
||
Applying chat template (num_proc=12): 80%|███████▉ | 34854/43598 [00:08<00:01, 4412.29 examples/s]
|
||
Applying chat template (num_proc=12): 84%|████████▎ | 36433/43598 [00:08<00:01, 3874.76 examples/s]
|
||
Applying chat template (num_proc=12): 76%|███████▋ | 33332/43598 [00:08<00:02, 4641.64 examples/s]
|
||
Applying chat template (num_proc=12): 75%|███████▍ | 32499/43598 [00:08<00:02, 4956.35 examples/s]
|
||
Applying chat template (num_proc=12): 80%|███████▉ | 34713/43598 [00:08<00:01, 5678.98 examples/s]
|
||
Applying chat template (num_proc=12): 69%|██████▉ | 30175/43598 [00:08<00:02, 4656.74 examples/s]
|
||
Applying chat template (num_proc=12): 81%|████████ | 35303/43598 [00:09<00:01, 4367.17 examples/s]
|
||
Applying chat template (num_proc=12): 84%|████████▍ | 36831/43598 [00:09<00:01, 3798.53 examples/s]
|
||
Applying chat template (num_proc=12): 78%|███████▊ | 33820/43598 [00:08<00:02, 4624.68 examples/s]
|
||
Applying chat template (num_proc=12): 76%|███████▌ | 33035/43598 [00:09<00:02, 4888.98 examples/s]
|
||
Applying chat template (num_proc=12): 81%|████████ | 35294/43598 [00:09<00:01, 5440.58 examples/s]
|
||
Applying chat template (num_proc=12): 70%|███████ | 30667/43598 [00:08<00:02, 4517.02 examples/s]
|
||
Applying chat template (num_proc=12): 82%|████████▏ | 35762/43598 [00:09<00:01, 4380.80 examples/s]
|
||
Applying chat template (num_proc=12): 85%|████████▌ | 37253/43598 [00:09<00:01, 3901.10 examples/s]
|
||
Applying chat template (num_proc=12): 77%|███████▋ | 33661/43598 [00:09<00:01, 5064.83 examples/s]
|
||
Applying chat template (num_proc=12): 6%|▌ | 2549/43598 [00:06<00:51, 804.23 examples/s]
|
||
Applying chat template (num_proc=12): 79%|███████▊ | 34305/43598 [00:09<00:02, 4281.25 examples/s]
|
||
Applying chat template (num_proc=12): 82%|████████▏ | 35851/43598 [00:09<00:01, 5291.68 examples/s]
|
||
Applying chat template (num_proc=12): 71%|███████▏ | 31135/43598 [00:09<00:02, 4525.23 examples/s]
|
||
Applying chat template (num_proc=12): 83%|████████▎ | 36207/43598 [00:09<00:01, 4314.28 examples/s]
|
||
Applying chat template (num_proc=12): 86%|████████▋ | 37697/43598 [00:09<00:01, 3910.41 examples/s]
|
||
Applying chat template (num_proc=12): 78%|███████▊ | 34224/43598 [00:09<00:01, 5111.20 examples/s]
|
||
Applying chat template (num_proc=12): 83%|████████▎ | 36401/43598 [00:09<00:01, 4824.14 examples/s]
|
||
Applying chat template (num_proc=12): 80%|███████▉ | 34757/43598 [00:09<00:02, 3906.30 examples/s]
|
||
Applying chat template (num_proc=12): 73%|███████▎ | 31636/43598 [00:09<00:02, 4517.14 examples/s]
|
||
Applying chat template (num_proc=12): 12%|█▏ | 5191/43598 [00:07<00:26, 1438.09 examples/s]
|
||
Applying chat template (num_proc=12): 84%|████████▍ | 36695/43598 [00:09<00:01, 4330.08 examples/s]
|
||
Applying chat template (num_proc=12): 87%|████████▋ | 38113/43598 [00:09<00:01, 3766.70 examples/s]
|
||
Applying chat template (num_proc=12): 80%|███████▉ | 34750/43598 [00:09<00:01, 4875.00 examples/s]
|
||
Applying chat template (num_proc=12): 81%|████████ | 35179/43598 [00:09<00:02, 3847.44 examples/s]
|
||
Applying chat template (num_proc=12): 74%|███████▎ | 32107/43598 [00:09<00:02, 4498.37 examples/s]
|
||
Applying chat template (num_proc=12): 85%|████████▍ | 36913/43598 [00:09<00:01, 4575.75 examples/s]
|
||
Applying chat template (num_proc=12): 85%|████████▌ | 37139/43598 [00:09<00:01, 4230.03 examples/s]
|
||
Applying chat template (num_proc=12): 88%|████████▊ | 38493/43598 [00:09<00:01, 3547.25 examples/s]
|
||
Applying chat template (num_proc=12): 81%|████████ | 35317/43598 [00:09<00:01, 4980.56 examples/s]
|
||
Applying chat template (num_proc=12): 75%|███████▍ | 32600/43598 [00:09<00:02, 4616.50 examples/s]
|
||
Applying chat template (num_proc=12): 82%|████████▏ | 35592/43598 [00:09<00:02, 3873.64 examples/s]
|
||
Applying chat template (num_proc=12): 86%|████████▌ | 37589/43598 [00:09<00:01, 4248.10 examples/s]
|
||
Applying chat template (num_proc=12): 86%|████████▌ | 37398/43598 [00:09<00:01, 4559.12 examples/s]
|
||
Applying chat template (num_proc=12): 89%|████████▉ | 38862/43598 [00:09<00:01, 3482.55 examples/s]
|
||
Applying chat template (num_proc=12): 10%|▉ | 4178/43598 [00:06<00:28, 1404.66 examples/s]
|
||
Applying chat template (num_proc=12): 82%|████████▏ | 35857/43598 [00:09<00:01, 4937.02 examples/s]
|
||
Applying chat template (num_proc=12): 76%|███████▌ | 33074/43598 [00:09<00:02, 4572.28 examples/s]
|
||
Applying chat template (num_proc=12): 87%|████████▋ | 38021/43598 [00:09<00:01, 4214.59 examples/s]
|
||
Applying chat template (num_proc=12): 83%|████████▎ | 36019/43598 [00:09<00:02, 3748.27 examples/s]
|
||
Applying chat template (num_proc=12): 90%|█████████ | 39240/43598 [00:09<00:01, 3370.15 examples/s]
|
||
Applying chat template (num_proc=12): 87%|████████▋ | 37887/43598 [00:09<00:01, 3812.74 examples/s]
|
||
Applying chat template (num_proc=12): 77%|███████▋ | 33577/43598 [00:09<00:02, 4556.39 examples/s]
|
||
Applying chat template (num_proc=12): 83%|████████▎ | 36374/43598 [00:09<00:01, 4324.19 examples/s]
|
||
Applying chat template (num_proc=12): 84%|████████▎ | 36431/43598 [00:09<00:01, 3725.28 examples/s]
|
||
Applying chat template (num_proc=12): 88%|████████▊ | 38444/43598 [00:09<00:01, 3936.91 examples/s]
|
||
Applying chat template (num_proc=12): 88%|████████▊ | 38312/43598 [00:09<00:01, 3837.20 examples/s]
|
||
Applying chat template (num_proc=12): 91%|█████████ | 39590/43598 [00:09<00:01, 3160.13 examples/s]
|
||
Applying chat template (num_proc=12): 78%|███████▊ | 34052/43598 [00:09<00:02, 4481.80 examples/s]
|
||
Applying chat template (num_proc=12): 85%|████████▍ | 36850/43598 [00:09<00:01, 3847.34 examples/s]
|
||
Applying chat template (num_proc=12): 85%|████████▍ | 36874/43598 [00:09<00:01, 4407.85 examples/s]
|
||
Applying chat template (num_proc=12): 89%|████████▉ | 38880/43598 [00:09<00:01, 3927.73 examples/s]
|
||
Applying chat template (num_proc=12): 79%|███████▉ | 34627/43598 [00:09<00:01, 4805.55 examples/s]
|
||
Applying chat template (num_proc=12): 92%|█████████▏| 39916/43598 [00:09<00:01, 2971.37 examples/s]
|
||
Applying chat template (num_proc=12): 89%|████████▉ | 38721/43598 [00:09<00:01, 3626.50 examples/s]
|
||
Applying chat template (num_proc=12): 86%|████████▌ | 37399/43598 [00:09<00:01, 4626.10 examples/s]
|
||
Applying chat template (num_proc=12): 85%|████████▌ | 37266/43598 [00:09<00:01, 3709.29 examples/s]
|
||
Applying chat template (num_proc=12): 90%|█████████ | 39394/43598 [00:09<00:01, 4135.08 examples/s]
|
||
Applying chat template (num_proc=12): 18%|█▊ | 7946/43598 [00:08<00:16, 2125.77 examples/s]
|
||
Applying chat template (num_proc=12): 13%|█▎ | 5628/43598 [00:07<00:20, 1824.85 examples/s]
|
||
Applying chat template (num_proc=12): 87%|████████▋ | 37911/43598 [00:10<00:01, 4758.85 examples/s]
|
||
Applying chat template (num_proc=12): 90%|████████▉ | 39115/43598 [00:10<00:01, 3649.61 examples/s]
|
||
Applying chat template (num_proc=12): 81%|████████ | 35191/43598 [00:09<00:01, 4801.01 examples/s]
|
||
Applying chat template (num_proc=12): 92%|█████████▏| 40232/43598 [00:10<00:01, 2739.54 examples/s]
|
||
Applying chat template (num_proc=12): 87%|████████▋ | 37717/43598 [00:10<00:01, 3648.67 examples/s]
|
||
Applying chat template (num_proc=12): 91%|█████████▏| 39859/43598 [00:10<00:00, 3881.09 examples/s]
|
||
Applying chat template (num_proc=12): 18%|█▊ | 7834/43598 [00:07<00:11, 3208.40 examples/s]
|
||
Applying chat template (num_proc=12): 88%|████████▊ | 38398/43598 [00:10<00:01, 4779.09 examples/s]
|
||
Applying chat template (num_proc=12): 82%|████████▏ | 35688/43598 [00:10<00:01, 4661.57 examples/s]
|
||
Applying chat template (num_proc=12): 87%|████████▋ | 38095/43598 [00:10<00:01, 3572.34 examples/s]
|
||
Applying chat template (num_proc=12): 93%|█████████▎| 40557/43598 [00:10<00:01, 2647.34 examples/s]
|
||
Applying chat template (num_proc=12): 91%|█████████ | 39516/43598 [00:10<00:01, 3185.91 examples/s]
|
||
Applying chat template (num_proc=12): 92%|█████████▏| 40275/43598 [00:10<00:00, 3719.53 examples/s]
|
||
Applying chat template (num_proc=12): 89%|████████▉ | 38955/43598 [00:10<00:00, 4998.19 examples/s]
|
||
Applying chat template (num_proc=12): 83%|████████▎ | 36215/43598 [00:10<00:01, 4793.61 examples/s]
|
||
Applying chat template (num_proc=12): 20%|██ | 8729/43598 [00:07<00:09, 3549.32 examples/s]
|
||
Applying chat template (num_proc=12): 94%|█████████▎| 40870/43598 [00:10<00:01, 2687.77 examples/s]
|
||
Applying chat template (num_proc=12): 88%|████████▊ | 38463/43598 [00:10<00:01, 3268.14 examples/s]
|
||
Applying chat template (num_proc=12): 93%|█████████▎| 40686/43598 [00:10<00:00, 3649.14 examples/s]
|
||
Applying chat template (num_proc=12): 91%|█████████▏| 39882/43598 [00:10<00:01, 3035.86 examples/s]
|
||
Applying chat template (num_proc=12): 91%|█████████ | 39563/43598 [00:10<00:00, 5157.80 examples/s]
|
||
Applying chat template (num_proc=12): 84%|████████▍ | 36738/43598 [00:10<00:01, 4911.64 examples/s]
|
||
Applying chat template (num_proc=12): 95%|█████████▍| 41208/43598 [00:10<00:00, 2785.85 examples/s]
|
||
Applying chat template (num_proc=12): 89%|████████▉ | 38856/43598 [00:10<00:01, 3391.36 examples/s]
|
||
Applying chat template (num_proc=12): 22%|██▏ | 9510/43598 [00:07<00:09, 3760.11 examples/s]
|
||
Applying chat template (num_proc=12): 95%|█████████▍| 41280/43598 [00:10<00:00, 4181.23 examples/s]
|
||
Applying chat template (num_proc=12): 92%|█████████▏| 40231/43598 [00:10<00:01, 3051.37 examples/s]
|
||
Applying chat template (num_proc=12): 85%|████████▌ | 37259/43598 [00:10<00:01, 4481.70 examples/s]
|
||
Applying chat template (num_proc=12): 95%|█████████▌| 41529/43598 [00:10<00:00, 2893.06 examples/s]
|
||
Applying chat template (num_proc=12): 27%|██▋ | 11781/43598 [00:08<00:09, 3246.43 examples/s]
|
||
Applying chat template (num_proc=12): 92%|█████████▏| 40114/43598 [00:10<00:00, 4173.53 examples/s]
|
||
Applying chat template (num_proc=12): 96%|█████████▋| 42007/43598 [00:10<00:00, 5007.80 examples/s]
|
||
Applying chat template (num_proc=12): 93%|█████████▎| 40735/43598 [00:10<00:00, 3537.16 examples/s]
|
||
Applying chat template (num_proc=12): 90%|████████▉ | 39229/43598 [00:10<00:01, 3292.97 examples/s]
|
||
Applying chat template (num_proc=12): 23%|██▎ | 10214/43598 [00:07<00:08, 3948.36 examples/s]
|
||
Applying chat template (num_proc=12): 87%|████████▋ | 37769/43598 [00:10<00:01, 4389.20 examples/s]
|
||
Applying chat template (num_proc=12): 96%|█████████▌| 41848/43598 [00:10<00:00, 2941.75 examples/s]
|
||
Applying chat template (num_proc=12): 36%|███▌ | 15737/43598 [00:08<00:05, 5313.15 examples/s]
|
||
Applying chat template (num_proc=12): 95%|█████████▍| 41208/43598 [00:10<00:00, 3845.12 examples/s]
|
||
Applying chat template (num_proc=12): 98%|█████████▊| 42567/43598 [00:10<00:00, 4922.62 examples/s]
|
||
Applying chat template (num_proc=12): 91%|█████████ | 39649/43598 [00:10<00:01, 3387.02 examples/s]
|
||
Applying chat template (num_proc=12): 25%|██▍ | 10876/43598 [00:07<00:07, 4309.03 examples/s]
|
||
Applying chat template (num_proc=12): 93%|█████████▎| 40566/43598 [00:10<00:00, 3587.30 examples/s]
|
||
Applying chat template (num_proc=12): 88%|████████▊ | 38225/43598 [00:10<00:01, 4269.14 examples/s]
|
||
Applying chat template (num_proc=12): 97%|█████████▋| 42184/43598 [00:10<00:00, 3003.66 examples/s]
|
||
Applying chat template (num_proc=12): 96%|█████████▌| 41818/43598 [00:10<00:00, 4457.95 examples/s]
|
||
Applying chat template (num_proc=12): 99%|█████████▉| 43105/43598 [00:10<00:00, 4654.93 examples/s]
|
||
Applying chat template (num_proc=12): 27%|██▋ | 11598/43598 [00:07<00:06, 4732.47 examples/s]
|
||
Applying chat template (num_proc=12): 92%|█████████▏| 40074/43598 [00:10<00:01, 3340.13 examples/s]
|
||
Applying chat template (num_proc=12): 89%|████████▉ | 38816/43598 [00:10<00:01, 4705.16 examples/s]
|
||
Applying chat template (num_proc=12): 98%|█████████▊| 42561/43598 [00:10<00:00, 3164.24 examples/s]
|
||
Applying chat template (num_proc=12): 97%|█████████▋| 42293/43598 [00:10<00:00, 4512.94 examples/s]
|
||
Applying chat template (num_proc=12): 28%|██▊ | 12260/43598 [00:08<00:06, 5094.07 examples/s]
|
||
Applying chat template (num_proc=12): 40%|███▉ | 17241/43598 [00:09<00:04, 5369.16 examples/s]
|
||
Applying chat template (num_proc=12): 93%|█████████▎| 40498/43598 [00:10<00:00, 3532.44 examples/s]
|
||
Applying chat template (num_proc=12): 94%|█████████▍| 40972/43598 [00:10<00:00, 3032.94 examples/s]
|
||
Applying chat template (num_proc=12): 90%|█████████ | 39400/43598 [00:10<00:00, 4936.60 examples/s]
|
||
Applying chat template (num_proc=12): 98%|█████████▊| 42902/43598 [00:10<00:00, 3151.53 examples/s]
|
||
Applying chat template (num_proc=12): 100%|█████████▉| 43591/43598 [00:11<00:00, 3831.01 examples/s]
|
||
Applying chat template (num_proc=12): 98%|█████████▊| 42865/43598 [00:11<00:00, 4440.38 examples/s]
|
||
Applying chat template (num_proc=12): 30%|██▉ | 12966/43598 [00:08<00:05, 5411.47 examples/s]
|
||
Applying chat template (num_proc=12): 94%|█████████▍| 40957/43598 [00:10<00:00, 3636.81 examples/s]
|
||
Applying chat template (num_proc=12): 95%|█████████▍| 41312/43598 [00:11<00:00, 2953.50 examples/s]
|
||
Applying chat template (num_proc=12): 92%|█████████▏| 40159/43598 [00:10<00:00, 5658.56 examples/s]
|
||
Applying chat template (num_proc=12): 99%|█████████▉| 43285/43598 [00:11<00:00, 2883.38 examples/s]
|
||
Applying chat template (num_proc=12): 42%|████▏ | 18459/43598 [00:09<00:04, 5466.77 examples/s]
|
||
Applying chat template (num_proc=12): 31%|███▏ | 13694/43598 [00:08<00:05, 5826.65 examples/s]
|
||
Applying chat template (num_proc=12): 95%|█████████▌| 41423/43598 [00:11<00:00, 3909.32 examples/s]
|
||
Applying chat template (num_proc=12): 93%|█████████▎| 40753/43598 [00:11<00:00, 5733.21 examples/s]Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 314, in _bootstrap
|
||
self.run()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 108, in run
|
||
self._target(*self._args, **self._kwargs)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 600, in _run_server
|
||
server.serve_forever()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 184, in serve_forever
|
||
sys.exit(0)
|
||
SystemExit: 0
|
||
|
||
During handling of the above exception, another exception occurred:
|
||
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 300, in _run_finalizers
|
||
finalizer()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 224, in __call__
|
||
res = self._callback(*self._args, **self._kwargs)
|
||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 133, in _remove_temp_dir
|
||
rmtree(tempdir)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 752, in rmtree
|
||
_rmtree_safe_fd(fd, path, onerror)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 703, in _rmtree_safe_fd
|
||
onerror(os.unlink, fullname, sys.exc_info())
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 701, in _rmtree_safe_fd
|
||
os.unlink(entry.name, dir_fd=topfd)
|
||
OSError: [Errno 16] Device or resource busy: '.nfsf9a4022bf7e8233f0002fc07'
|
||
|
||
Applying chat template (num_proc=12): 100%|██████████| 43598/43598 [00:11<00:00, 3876.47 examples/s]
|
||
|
||
Applying chat template (num_proc=12): 96%|█████████▌| 41681/43598 [00:11<00:00, 2666.74 examples/s]
|
||
Applying chat template (num_proc=12): 33%|███▎ | 14488/43598 [00:08<00:04, 6293.59 examples/s]
|
||
Applying chat template (num_proc=12): 95%|█████████▍| 41402/43598 [00:11<00:00, 5543.71 examples/s]
|
||
Applying chat template (num_proc=12): 45%|████▍ | 19501/43598 [00:09<00:04, 5546.13 examples/s]
|
||
Applying chat template (num_proc=12): 96%|█████████▌| 41838/43598 [00:11<00:00, 3427.07 examples/s]
|
||
Applying chat template (num_proc=12): 99%|█████████▉| 43360/43598 [00:11<00:00, 2928.55 examples/s]
|
||
Applying chat template (num_proc=12): 35%|███▌ | 15321/43598 [00:08<00:04, 6760.74 examples/s]
|
||
Applying chat template (num_proc=12): 100%|██████████| 43598/43598 [00:11<00:00, 2162.53 examples/s]
|
||
Applying chat template (num_proc=12): 96%|█████████▋| 42028/43598 [00:11<00:00, 5716.52 examples/s]
|
||
Applying chat template (num_proc=12): 47%|████▋ | 20509/43598 [00:09<00:03, 6079.53 examples/s]
|
||
Applying chat template (num_proc=12): 97%|█████████▋| 42317/43598 [00:11<00:00, 3583.20 examples/s]
|
||
Applying chat template (num_proc=12): 38%|███▊ | 16372/43598 [00:08<00:03, 7704.11 examples/s]
|
||
Applying chat template (num_proc=12): 96%|█████████▋| 42001/43598 [00:11<00:00, 2209.12 examples/s]
|
||
Applying chat template (num_proc=12): 50%|████▉ | 21649/43598 [00:09<00:03, 6853.55 examples/s]Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 314, in _bootstrap
|
||
self.run()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 108, in run
|
||
self._target(*self._args, **self._kwargs)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 600, in _run_server
|
||
server.serve_forever()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 184, in serve_forever
|
||
sys.exit(0)
|
||
SystemExit: 0
|
||
|
||
During handling of the above exception, another exception occurred:
|
||
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 300, in _run_finalizers
|
||
finalizer()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 224, in __call__
|
||
res = self._callback(*self._args, **self._kwargs)
|
||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 133, in _remove_temp_dir
|
||
rmtree(tempdir)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 752, in rmtree
|
||
_rmtree_safe_fd(fd, path, onerror)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 703, in _rmtree_safe_fd
|
||
onerror(os.unlink, fullname, sys.exc_info())
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 701, in _rmtree_safe_fd
|
||
os.unlink(entry.name, dir_fd=topfd)
|
||
OSError: [Errno 16] Device or resource busy: '.nfs67dd64b0e007b20b0002fc0f'
|
||
|
||
Applying chat template (num_proc=12): 41%|████ | 17720/43598 [00:08<00:02, 9293.35 examples/s]
|
||
Applying chat template (num_proc=12): 100%|██████████| 43598/43598 [00:11<00:00, 3769.94 examples/s]
|
||
|
||
Applying chat template (num_proc=12): 98%|█████████▊| 42775/43598 [00:11<00:00, 3573.25 examples/s]
|
||
Applying chat template (num_proc=12): 97%|█████████▋| 42365/43598 [00:11<00:00, 2253.23 examples/s]
|
||
Applying chat template (num_proc=12): 52%|█████▏ | 22716/43598 [00:09<00:02, 7495.30 examples/s]
|
||
Applying chat template (num_proc=12): 98%|█████████▊| 42634/43598 [00:11<00:00, 4182.18 examples/s]
|
||
Applying chat template (num_proc=12): 43%|████▎ | 18859/43598 [00:08<00:02, 9559.14 examples/s]
|
||
Applying chat template (num_proc=12): 98%|█████████▊| 42683/43598 [00:11<00:00, 2438.37 examples/s]
|
||
Applying chat template (num_proc=12): 55%|█████▌ | 24195/43598 [00:09<00:02, 8926.28 examples/s]
|
||
Applying chat template (num_proc=12): 99%|█████████▉| 43140/43598 [00:11<00:00, 3006.59 examples/s]Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 314, in _bootstrap
|
||
self.run()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 108, in run
|
||
self._target(*self._args, **self._kwargs)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 600, in _run_server
|
||
server.serve_forever()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 184, in serve_forever
|
||
sys.exit(0)
|
||
SystemExit: 0
|
||
|
||
During handling of the above exception, another exception occurred:
|
||
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 300, in _run_finalizers
|
||
finalizer()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 224, in __call__
|
||
res = self._callback(*self._args, **self._kwargs)
|
||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 133, in _remove_temp_dir
|
||
rmtree(tempdir)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 752, in rmtree
|
||
_rmtree_safe_fd(fd, path, onerror)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 703, in _rmtree_safe_fd
|
||
onerror(os.unlink, fullname, sys.exc_info())
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 701, in _rmtree_safe_fd
|
||
os.unlink(entry.name, dir_fd=topfd)
|
||
OSError: [Errno 16] Device or resource busy: '.nfsabf784d28e18fed40002fc13'
|
||
|
||
Applying chat template (num_proc=12): 0%| | 0/2339 [00:00<?, ? examples/s]
|
||
Applying chat template (num_proc=12): 46%|████▋ | 20180/43598 [00:08<00:02, 10523.04 examples/s]
|
||
Applying chat template (num_proc=12): 100%|██████████| 43598/43598 [00:11<00:00, 3701.98 examples/s]
|
||
|
||
Applying chat template (num_proc=12): 58%|█████▊ | 25441/43598 [00:09<00:01, 9698.43 examples/s]
|
||
Applying chat template (num_proc=12): 99%|█████████▊| 42965/43598 [00:11<00:00, 2345.46 examples/s]
|
||
Applying chat template (num_proc=12): 50%|█████ | 21979/43598 [00:08<00:01, 12365.73 examples/s]
|
||
Applying chat template (num_proc=12): 99%|█████████▉| 43139/43598 [00:11<00:00, 3160.37 examples/s]
|
||
Applying chat template (num_proc=12): 61%|██████ | 26635/43598 [00:10<00:01, 10200.50 examples/s]
|
||
Applying chat template (num_proc=12): 55%|█████▍ | 23866/43598 [00:09<00:01, 14209.51 examples/s]
|
||
Applying chat template (num_proc=12): 100%|█████████▉| 43562/43598 [00:11<00:00, 2346.53 examples/s]
|
||
Applying chat template (num_proc=12): 64%|██████▍ | 27878/43598 [00:10<00:01, 10459.93 examples/s]
|
||
Applying chat template (num_proc=12): 59%|█████▉ | 25797/43598 [00:09<00:01, 15676.30 examples/s]
|
||
Applying chat template (num_proc=12): 99%|█████████▉| 43251/43598 [00:12<00:00, 1897.18 examples/s]
|
||
Applying chat template (num_proc=12): 67%|██████▋ | 29063/43598 [00:10<00:01, 10772.94 examples/s]
|
||
Applying chat template (num_proc=12): 64%|██████▍ | 27810/43598 [00:09<00:00, 16949.16 examples/s]Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 314, in _bootstrap
|
||
self.run()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 108, in run
|
||
self._target(*self._args, **self._kwargs)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 600, in _run_server
|
||
server.serve_forever()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 184, in serve_forever
|
||
sys.exit(0)
|
||
SystemExit: 0
|
||
|
||
During handling of the above exception, another exception occurred:
|
||
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 300, in _run_finalizers
|
||
finalizer()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 224, in __call__
|
||
res = self._callback(*self._args, **self._kwargs)
|
||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 133, in _remove_temp_dir
|
||
rmtree(tempdir)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 752, in rmtree
|
||
_rmtree_safe_fd(fd, path, onerror)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 703, in _rmtree_safe_fd
|
||
onerror(os.unlink, fullname, sys.exc_info())
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 701, in _rmtree_safe_fd
|
||
os.unlink(entry.name, dir_fd=topfd)
|
||
OSError: [Errno 16] Device or resource busy: '.nfs3b0c1cfa8eaa70fe0002fc1d'
|
||
|
||
Applying chat template (num_proc=12): 100%|██████████| 43598/43598 [00:12<00:00, 3589.44 examples/s]
|
||
|
||
Applying chat template (num_proc=12): 70%|██████▉ | 30427/43598 [00:10<00:01, 11168.53 examples/s]
|
||
Applying chat template (num_proc=12): 100%|█████████▉| 43534/43598 [00:12<00:00, 2185.89 examples/s]
|
||
Applying chat template (num_proc=12): 100%|█████████▉| 43528/43598 [00:12<00:00, 1703.98 examples/s]
|
||
Applying chat template (num_proc=12): 68%|██████▊ | 29523/43598 [00:09<00:00, 15328.06 examples/s]
|
||
Applying chat template (num_proc=12): 73%|███████▎ | 31619/43598 [00:10<00:01, 11236.03 examples/s]
|
||
Applying chat template (num_proc=12): 72%|███████▏ | 31604/43598 [00:09<00:00, 16448.48 examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 0/2339 [00:00<?, ? examples/s]
|
||
Applying chat template (num_proc=12): 75%|███████▌ | 32903/43598 [00:10<00:00, 11664.33 examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 0/2339 [00:00<?, ? examples/s]Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 314, in _bootstrap
|
||
self.run()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 108, in run
|
||
self._target(*self._args, **self._kwargs)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 600, in _run_server
|
||
server.serve_forever()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 184, in serve_forever
|
||
sys.exit(0)
|
||
SystemExit: 0
|
||
|
||
During handling of the above exception, another exception occurred:
|
||
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 300, in _run_finalizers
|
||
finalizer()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 224, in __call__
|
||
res = self._callback(*self._args, **self._kwargs)
|
||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 133, in _remove_temp_dir
|
||
rmtree(tempdir)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 752, in rmtree
|
||
_rmtree_safe_fd(fd, path, onerror)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 703, in _rmtree_safe_fd
|
||
onerror(os.unlink, fullname, sys.exc_info())
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 701, in _rmtree_safe_fd
|
||
os.unlink(entry.name, dir_fd=topfd)
|
||
OSError: [Errno 16] Device or resource busy: '.nfs215b9bee637d692a0002fc21'
|
||
|
||
Applying chat template (num_proc=12): 100%|██████████| 43598/43598 [00:12<00:00, 3519.40 examples/s]
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 314, in _bootstrap
|
||
self.run()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 108, in run
|
||
self._target(*self._args, **self._kwargs)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 600, in _run_server
|
||
server.serve_forever()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 184, in serve_forever
|
||
sys.exit(0)
|
||
SystemExit: 0
|
||
|
||
During handling of the above exception, another exception occurred:
|
||
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 300, in _run_finalizers
|
||
finalizer()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 224, in __call__
|
||
res = self._callback(*self._args, **self._kwargs)
|
||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 133, in _remove_temp_dir
|
||
rmtree(tempdir)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 752, in rmtree
|
||
_rmtree_safe_fd(fd, path, onerror)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 703, in _rmtree_safe_fd
|
||
onerror(os.unlink, fullname, sys.exc_info())
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 701, in _rmtree_safe_fd
|
||
os.unlink(entry.name, dir_fd=topfd)
|
||
OSError: [Errno 16] Device or resource busy: '.nfs0bda44f7736da8d90002fc22'
|
||
|
||
Applying chat template (num_proc=12): 77%|███████▋ | 33420/43598 [00:09<00:00, 16638.44 examples/s]
|
||
Applying chat template (num_proc=12): 100%|██████████| 43598/43598 [00:12<00:00, 3471.20 examples/s]
|
||
|
||
Applying chat template (num_proc=12): 79%|███████▉ | 34425/43598 [00:10<00:00, 12318.26 examples/s]
|
||
Applying chat template (num_proc=12): 81%|████████ | 35175/43598 [00:09<00:00, 16035.72 examples/s]Concatenating 12 shards
|
||
2026-04-14 19:26:51 - INFO - datasets.arrow_dataset - Concatenating 12 shards
|
||
|
||
Applying chat template (num_proc=12): 82%|████████▏ | 35836/43598 [00:10<00:00, 11362.99 examples/s]
|
||
Applying chat template (num_proc=12): 85%|████████▍ | 36868/43598 [00:09<00:00, 14688.56 examples/s]
|
||
Applying chat template (num_proc=12): 85%|████████▍ | 37039/43598 [00:10<00:00, 10601.19 examples/s]Process #0 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00000_of_00012.arrow
|
||
2026-04-14 19:26:52 - INFO - datasets.arrow_dataset - Process #0 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00000_of_00012.arrow
|
||
Process #1 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00001_of_00012.arrow
|
||
2026-04-14 19:26:52 - INFO - datasets.arrow_dataset - Process #1 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00001_of_00012.arrow
|
||
Process #2 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00002_of_00012.arrow
|
||
2026-04-14 19:26:52 - INFO - datasets.arrow_dataset - Process #2 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00002_of_00012.arrow
|
||
Process #3 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00003_of_00012.arrow
|
||
2026-04-14 19:26:52 - INFO - datasets.arrow_dataset - Process #3 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00003_of_00012.arrow
|
||
Process #4 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00004_of_00012.arrow
|
||
2026-04-14 19:26:52 - INFO - datasets.arrow_dataset - Process #4 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00004_of_00012.arrow
|
||
Process #5 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00005_of_00012.arrow
|
||
2026-04-14 19:26:52 - INFO - datasets.arrow_dataset - Process #5 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00005_of_00012.arrow
|
||
Process #6 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00006_of_00012.arrow
|
||
2026-04-14 19:26:52 - INFO - datasets.arrow_dataset - Process #6 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00006_of_00012.arrow
|
||
Process #7 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00007_of_00012.arrow
|
||
2026-04-14 19:26:52 - INFO - datasets.arrow_dataset - Process #7 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00007_of_00012.arrow
|
||
Process #8 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00008_of_00012.arrow
|
||
2026-04-14 19:26:52 - INFO - datasets.arrow_dataset - Process #8 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00008_of_00012.arrow
|
||
Process #9 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00009_of_00012.arrow
|
||
2026-04-14 19:26:52 - INFO - datasets.arrow_dataset - Process #9 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00009_of_00012.arrow
|
||
Process #10 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00010_of_00012.arrow
|
||
2026-04-14 19:26:52 - INFO - datasets.arrow_dataset - Process #10 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00010_of_00012.arrow
|
||
Process #11 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00011_of_00012.arrow
|
||
2026-04-14 19:26:52 - INFO - datasets.arrow_dataset - Process #11 will write at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00011_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 88%|████████▊ | 38455/43598 [00:10<00:00, 14027.21 examples/s]
|
||
Applying chat template (num_proc=12): 88%|████████▊ | 38168/43598 [00:11<00:00, 10311.45 examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 1/2339 [00:01<48:06, 1.23s/ examples]
|
||
Applying chat template (num_proc=12): 92%|█████████▏| 39907/43598 [00:10<00:00, 13073.21 examples/s]Spawning 12 processes
|
||
2026-04-14 19:26:52 - INFO - datasets.arrow_dataset - Spawning 12 processes
|
||
|
||
Applying chat template (num_proc=12): 0%| | 0/2339 [00:00<?, ? examples/s]
|
||
Applying chat template (num_proc=12): 90%|█████████ | 39278/43598 [00:11<00:00, 9013.31 examples/s]
|
||
Applying chat template (num_proc=12): 8%|▊ | 184/2339 [00:01<00:12, 174.47 examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 0/2339 [00:00<?, ? examples/s]
|
||
Applying chat template (num_proc=12): 95%|█████████▍| 41258/43598 [00:10<00:00, 11119.29 examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 0/2339 [00:00<?, ? examples/s]
|
||
Applying chat template (num_proc=12): 92%|█████████▏| 40220/43598 [00:11<00:00, 7794.99 examples/s]
|
||
Applying chat template (num_proc=12): 14%|█▍ | 327/2339 [00:01<00:07, 276.93 examples/s]
|
||
Applying chat template (num_proc=12): 1%|▏ | 35/2339 [00:01<01:11, 32.17 examples/s]
|
||
Applying chat template (num_proc=12): 94%|█████████▍| 41066/43598 [00:11<00:00, 5954.59 examples/s]
|
||
Applying chat template (num_proc=12): 98%|█████████▊| 42516/43598 [00:10<00:00, 6500.08 examples/s]
|
||
Applying chat template (num_proc=12): 25%|██▌ | 586/2339 [00:01<00:03, 477.48 examples/s]
|
||
Applying chat template (num_proc=12): 96%|█████████▌| 41751/43598 [00:11<00:00, 5540.55 examples/s]
|
||
Applying chat template (num_proc=12): 4%|▍ | 98/2339 [00:01<00:25, 88.05 examples/s]
|
||
Applying chat template (num_proc=12): 50%|█████ | 1170/2339 [00:02<00:00, 1178.59 examples/s]
|
||
Applying chat template (num_proc=12): 97%|█████████▋| 42411/43598 [00:12<00:00, 4724.52 examples/s]
|
||
Applying chat template (num_proc=12): 17%|█▋ | 390/2339 [00:01<00:04, 396.63 examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 1/2339 [00:01<59:54, 1.54s/ examples]
|
||
Applying chat template (num_proc=12): 100%|█████████▉| 43458/43598 [00:11<00:00, 4477.37 examples/s]
|
||
Applying chat template (num_proc=12): 20%|██ | 476/2339 [00:01<00:04, 430.83 examples/s]
|
||
Applying chat template (num_proc=12): 99%|█████████▊| 42947/43598 [00:12<00:00, 3631.73 examples/s]Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 314, in _bootstrap
|
||
self.run()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 108, in run
|
||
self._target(*self._args, **self._kwargs)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 600, in _run_server
|
||
server.serve_forever()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 184, in serve_forever
|
||
sys.exit(0)
|
||
SystemExit: 0
|
||
|
||
During handling of the above exception, another exception occurred:
|
||
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 300, in _run_finalizers
|
||
finalizer()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 224, in __call__
|
||
res = self._callback(*self._args, **self._kwargs)
|
||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 133, in _remove_temp_dir
|
||
rmtree(tempdir)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 752, in rmtree
|
||
_rmtree_safe_fd(fd, path, onerror)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 703, in _rmtree_safe_fd
|
||
onerror(os.unlink, fullname, sys.exc_info())
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 701, in _rmtree_safe_fd
|
||
os.unlink(entry.name, dir_fd=topfd)
|
||
OSError: [Errno 16] Device or resource busy: '.nfs694a7be5fdcdda610002fc37'
|
||
|
||
Applying chat template (num_proc=12): 25%|██▍ | 575/2339 [00:01<00:03, 461.96 examples/s]
|
||
Applying chat template (num_proc=12): 61%|██████ | 1422/2339 [00:02<00:01, 855.04 examples/s]
|
||
Applying chat template (num_proc=12): 100%|██████████| 43598/43598 [00:11<00:00, 3822.91 examples/s]
|
||
|
||
Applying chat template (num_proc=12): 76%|███████▌ | 1781/2339 [00:02<00:00, 1173.76 examples/s]
|
||
Applying chat template (num_proc=12): 100%|█████████▉| 43381/43598 [00:12<00:00, 2928.15 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00001_of_00012.arrow
|
||
2026-04-14 19:26:53 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00001_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 32%|███▏ | 737/2339 [00:02<00:02, 575.37 examples/s]
|
||
Applying chat template (num_proc=12): 92%|█████████▏| 2144/2339 [00:02<00:00, 1510.81 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00000_of_00012.arrow
|
||
2026-04-14 19:26:53 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00000_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 0%| | 1/2339 [00:01<58:39, 1.51s/ examples]
|
||
Applying chat template (num_proc=12): 42%|████▏ | 975/2339 [00:02<00:01, 745.88 examples/s]Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 314, in _bootstrap
|
||
self.run()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 108, in run
|
||
self._target(*self._args, **self._kwargs)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 600, in _run_server
|
||
server.serve_forever()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 184, in serve_forever
|
||
sys.exit(0)
|
||
SystemExit: 0
|
||
|
||
During handling of the above exception, another exception occurred:
|
||
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 300, in _run_finalizers
|
||
finalizer()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 224, in __call__
|
||
res = self._callback(*self._args, **self._kwargs)
|
||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 133, in _remove_temp_dir
|
||
rmtree(tempdir)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 752, in rmtree
|
||
_rmtree_safe_fd(fd, path, onerror)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 703, in _rmtree_safe_fd
|
||
onerror(os.unlink, fullname, sys.exc_info())
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 701, in _rmtree_safe_fd
|
||
os.unlink(entry.name, dir_fd=topfd)
|
||
OSError: [Errno 16] Device or resource busy: '.nfse839e81080683f310002fc3d'
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 314, in _bootstrap
|
||
self.run()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 108, in run
|
||
self._target(*self._args, **self._kwargs)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 600, in _run_server
|
||
server.serve_forever()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 184, in serve_forever
|
||
sys.exit(0)
|
||
SystemExit: 0
|
||
|
||
During handling of the above exception, another exception occurred:
|
||
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 300, in _run_finalizers
|
||
finalizer()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 224, in __call__
|
||
res = self._callback(*self._args, **self._kwargs)
|
||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 133, in _remove_temp_dir
|
||
rmtree(tempdir)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 752, in rmtree
|
||
_rmtree_safe_fd(fd, path, onerror)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 703, in _rmtree_safe_fd
|
||
onerror(os.unlink, fullname, sys.exc_info())
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 701, in _rmtree_safe_fd
|
||
os.unlink(entry.name, dir_fd=topfd)
|
||
OSError: [Errno 16] Device or resource busy: '.nfsaea63d2d30451abb0002fc3e'
|
||
|
||
Applying chat template (num_proc=12): 17%|█▋ | 391/2339 [00:02<00:09, 211.30 examples/s]
|
||
Applying chat template (num_proc=12): 100%|██████████| 43598/43598 [00:12<00:00, 3377.67 examples/s]
|
||
|
||
Applying chat template (num_proc=12): 58%|█████▊ | 1361/2339 [00:02<00:00, 1269.99 examples/s]
|
||
Applying chat template (num_proc=12): 100%|██████████| 2339/2339 [00:03<00:00, 760.00 examples/s]
|
||
|
||
Applying chat template (num_proc=12): 8%|▊ | 196/2339 [00:01<00:15, 135.82 examples/s]
|
||
Filter: 0%| | 0/43598 [00:00<?, ? examples/s]
|
||
Filter: 0%| | 0/43598 [00:00<?, ? examples/s]
|
||
Applying chat template (num_proc=12): 24%|██▍ | 563/2339 [00:02<00:06, 293.87 examples/s]
|
||
Applying chat template (num_proc=12): 67%|██████▋ | 1561/2339 [00:02<00:00, 925.76 examples/s] Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00002_of_00012.arrow
|
||
2026-04-14 19:26:54 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00002_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 0%| | 1/2339 [00:01<1:14:39, 1.92s/ examples]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00003_of_00012.arrow
|
||
2026-04-14 19:26:54 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00003_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 75%|███████▌ | 1756/2339 [00:02<00:00, 971.15 examples/s]
|
||
Filter: 0%| | 0/43598 [00:00<?, ? examples/s]
|
||
Applying chat template (num_proc=12): 0%| | 1/2339 [00:02<1:26:12, 2.21s/ examples]
|
||
Applying chat template (num_proc=12): 27%|██▋ | 638/2339 [00:02<00:06, 262.06 examples/s]
|
||
Applying chat template (num_proc=12): 17%|█▋ | 390/2339 [00:02<00:08, 240.42 examples/s]
|
||
Applying chat template (num_proc=12): 17%|█▋ | 391/2339 [00:02<00:09, 214.04 examples/s]
|
||
Applying chat template (num_proc=12): 83%|████████▎ | 1951/2339 [00:03<00:00, 886.68 examples/s]
|
||
Applying chat template (num_proc=12): 25%|██▌ | 586/2339 [00:02<00:04, 355.97 examples/s]
|
||
Applying chat template (num_proc=12): 8%|▊ | 196/2339 [00:02<00:20, 106.82 examples/s]
|
||
Applying chat template (num_proc=12): 97%|█████████▋| 2280/2339 [00:03<00:00, 1216.39 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00005_of_00012.arrow
|
||
2026-04-14 19:26:55 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00005_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 56%|█████▌ | 1307/2339 [00:03<00:01, 669.15 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00004_of_00012.arrow
|
||
2026-04-14 19:26:55 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00004_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 33%|███▎ | 781/2339 [00:02<00:03, 436.77 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00006_of_00012.arrow
|
||
2026-04-14 19:26:55 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00006_of_00012.arrow
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 314, in _bootstrap
|
||
self.run()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 108, in run
|
||
self._target(*self._args, **self._kwargs)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 600, in _run_server
|
||
server.serve_forever()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 184, in serve_forever
|
||
sys.exit(0)
|
||
SystemExit: 0
|
||
|
||
During handling of the above exception, another exception occurred:
|
||
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 300, in _run_finalizers
|
||
finalizer()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 224, in __call__
|
||
res = self._callback(*self._args, **self._kwargs)
|
||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 133, in _remove_temp_dir
|
||
rmtree(tempdir)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 752, in rmtree
|
||
_rmtree_safe_fd(fd, path, onerror)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 703, in _rmtree_safe_fd
|
||
onerror(os.unlink, fullname, sys.exc_info())
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 701, in _rmtree_safe_fd
|
||
os.unlink(entry.name, dir_fd=topfd)
|
||
OSError: [Errno 16] Device or resource busy: '.nfse455371ae3b6de6e0002fc55'
|
||
|
||
Applying chat template (num_proc=12): 27%|██▋ | 628/2339 [00:02<00:04, 379.62 examples/s]
|
||
Applying chat template (num_proc=12): 100%|██████████| 2339/2339 [00:03<00:00, 673.64 examples/s]
|
||
|
||
Applying chat template (num_proc=12): 32%|███▏ | 744/2339 [00:02<00:03, 408.43 examples/s]
|
||
Filter: 0%| | 0/43598 [00:00<?, ? examples/s]
|
||
Applying chat template (num_proc=12): 36%|███▋ | 850/2339 [00:03<00:03, 408.65 examples/s]
|
||
Applying chat template (num_proc=12): 43%|████▎ | 996/2339 [00:02<00:02, 560.86 examples/s]
|
||
Applying chat template (num_proc=12): 61%|██████ | 1416/2339 [00:03<00:01, 554.15 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00007_of_00012.arrow
|
||
2026-04-14 19:26:55 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00007_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 59%|█████▉ | 1376/2339 [00:03<00:01, 862.37 examples/s]
|
||
Applying chat template (num_proc=12): 39%|███▉ | 923/2339 [00:03<00:02, 499.19 examples/s]
|
||
Applying chat template (num_proc=12): 57%|█████▋ | 1344/2339 [00:03<00:01, 865.19 examples/s]
|
||
Applying chat template (num_proc=12): 75%|███████▌ | 1756/2339 [00:03<00:00, 783.94 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00008_of_00012.arrow
|
||
2026-04-14 19:26:55 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00008_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 67%|██████▋ | 1560/2339 [00:03<00:00, 962.89 examples/s]
|
||
Applying chat template (num_proc=12): 65%|██████▌ | 1521/2339 [00:03<00:00, 962.10 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00009_of_00012.arrow
|
||
2026-04-14 19:26:55 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00009_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 83%|████████▎ | 1950/2339 [00:03<00:00, 902.92 examples/s]
|
||
Applying chat template (num_proc=12): 58%|█████▊ | 1365/2339 [00:03<00:01, 779.72 examples/s]
|
||
Applying chat template (num_proc=12): 77%|███████▋ | 1798/2339 [00:03<00:00, 1133.27 examples/s]
|
||
Applying chat template (num_proc=12): 75%|███████▌ | 1755/2339 [00:03<00:00, 1184.35 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00010_of_00012.arrow
|
||
2026-04-14 19:26:55 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00010_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 88%|████████▊ | 2069/2339 [00:03<00:00, 1241.68 examples/s]
|
||
Applying chat template (num_proc=12): 92%|█████████▏| 2145/2339 [00:04<00:00, 884.31 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00011_of_00012.arrow
|
||
2026-04-14 19:26:55 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-c697f3365c9a64c6_00011_of_00012.arrow
|
||
|
||
Applying chat template (num_proc=12): 66%|██████▌ | 1544/2339 [00:03<00:01, 754.83 examples/s]
|
||
Applying chat template (num_proc=12): 99%|█████████▊| 2309/2339 [00:04<00:00, 980.34 examples/s]
|
||
Applying chat template (num_proc=12): 100%|██████████| 2339/2339 [00:03<00:00, 1399.32 examples/s]
|
||
Applying chat template (num_proc=12): 85%|████████▍ | 1979/2339 [00:03<00:00, 1018.44 examples/s]Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 314, in _bootstrap
|
||
self.run()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 108, in run
|
||
self._target(*self._args, **self._kwargs)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 600, in _run_server
|
||
server.serve_forever()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 184, in serve_forever
|
||
sys.exit(0)
|
||
SystemExit: 0
|
||
|
||
During handling of the above exception, another exception occurred:
|
||
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 300, in _run_finalizers
|
||
finalizer()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 224, in __call__
|
||
res = self._callback(*self._args, **self._kwargs)
|
||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 133, in _remove_temp_dir
|
||
rmtree(tempdir)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 752, in rmtree
|
||
_rmtree_safe_fd(fd, path, onerror)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 703, in _rmtree_safe_fd
|
||
onerror(os.unlink, fullname, sys.exc_info())
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 701, in _rmtree_safe_fd
|
||
os.unlink(entry.name, dir_fd=topfd)
|
||
OSError: [Errno 16] Device or resource busy: '.nfs4761eb2e16d6ccc60002fc6e'
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 314, in _bootstrap
|
||
self.run()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 108, in run
|
||
self._target(*self._args, **self._kwargs)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 600, in _run_server
|
||
server.serve_forever()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 184, in serve_forever
|
||
sys.exit(0)
|
||
SystemExit: 0
|
||
|
||
During handling of the above exception, another exception occurred:
|
||
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 300, in _run_finalizers
|
||
finalizer()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 224, in __call__
|
||
res = self._callback(*self._args, **self._kwargs)
|
||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 133, in _remove_temp_dir
|
||
rmtree(tempdir)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 752, in rmtree
|
||
_rmtree_safe_fd(fd, path, onerror)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 703, in _rmtree_safe_fd
|
||
onerror(os.unlink, fullname, sys.exc_info())
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 701, in _rmtree_safe_fd
|
||
os.unlink(entry.name, dir_fd=topfd)
|
||
OSError: [Errno 16] Device or resource busy: '.nfsf70e9abf6e43a1d50002fc6d'
|
||
|
||
Applying chat template (num_proc=12): 77%|███████▋ | 1806/2339 [00:03<00:00, 895.74 examples/s]
|
||
Applying chat template (num_proc=12): 100%|██████████| 2339/2339 [00:03<00:00, 609.27 examples/s]
|
||
|
||
Applying chat template (num_proc=12): 100%|██████████| 2339/2339 [00:04<00:00, 523.18 examples/s]
|
||
|
||
Applying chat template (num_proc=12): 98%|█████████▊| 2300/2339 [00:03<00:00, 1318.00 examples/s]Concatenating 12 shards
|
||
2026-04-14 19:26:56 - INFO - datasets.arrow_dataset - Concatenating 12 shards
|
||
|
||
Filter: 0%| | 0/43598 [00:00<?, ? examples/s]
|
||
Filter: 0%| | 0/43598 [00:00<?, ? examples/s]
|
||
Applying chat template (num_proc=12): 83%|████████▎ | 1951/2339 [00:03<00:00, 921.74 examples/s]Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 314, in _bootstrap
|
||
self.run()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 108, in run
|
||
self._target(*self._args, **self._kwargs)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 600, in _run_server
|
||
server.serve_forever()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 184, in serve_forever
|
||
sys.exit(0)
|
||
SystemExit: 0
|
||
|
||
During handling of the above exception, another exception occurred:
|
||
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 300, in _run_finalizers
|
||
finalizer()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 224, in __call__
|
||
res = self._callback(*self._args, **self._kwargs)
|
||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 133, in _remove_temp_dir
|
||
rmtree(tempdir)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 752, in rmtree
|
||
_rmtree_safe_fd(fd, path, onerror)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 703, in _rmtree_safe_fd
|
||
onerror(os.unlink, fullname, sys.exc_info())
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 701, in _rmtree_safe_fd
|
||
os.unlink(entry.name, dir_fd=topfd)
|
||
OSError: [Errno 16] Device or resource busy: '.nfs55dff5ecfabad37e0002fc71'
|
||
|
||
Applying chat template (num_proc=12): 100%|██████████| 2339/2339 [00:03<00:00, 608.45 examples/s]
|
||
|
||
Applying chat template (num_proc=12): 100%|██████████| 2339/2339 [00:03<00:00, 1340.06 examples/s]
|
||
Filter: 0%| | 0/43598 [00:00<?, ? examples/s]Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 314, in _bootstrap
|
||
self.run()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/process.py", line 108, in run
|
||
self._target(*self._args, **self._kwargs)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 600, in _run_server
|
||
server.serve_forever()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/managers.py", line 184, in serve_forever
|
||
sys.exit(0)
|
||
SystemExit: 0
|
||
|
||
During handling of the above exception, another exception occurred:
|
||
|
||
Traceback (most recent call last):
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 300, in _run_finalizers
|
||
finalizer()
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 224, in __call__
|
||
res = self._callback(*self._args, **self._kwargs)
|
||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/multiprocess/util.py", line 133, in _remove_temp_dir
|
||
rmtree(tempdir)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 752, in rmtree
|
||
_rmtree_safe_fd(fd, path, onerror)
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 703, in _rmtree_safe_fd
|
||
onerror(os.unlink, fullname, sys.exc_info())
|
||
File "/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/shutil.py", line 701, in _rmtree_safe_fd
|
||
os.unlink(entry.name, dir_fd=topfd)
|
||
OSError: [Errno 16] Device or resource busy: '.nfsaee5f2f3a8a29e6e0002fc74'
|
||
|
||
Applying chat template (num_proc=12): 100%|██████████| 2339/2339 [00:04<00:00, 574.40 examples/s]
|
||
|
||
Filter: 0%| | 0/43598 [00:00<?, ? examples/s]
|
||
Filter: 23%|██▎ | 10000/43598 [00:09<00:33, 1000.42 examples/s]
|
||
Filter: 23%|██▎ | 10000/43598 [00:09<00:32, 1047.07 examples/s]
|
||
Filter: 23%|██▎ | 10000/43598 [00:10<00:35, 940.79 examples/s]Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-270aa536320be548.arrow
|
||
2026-04-14 19:27:05 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-270aa536320be548.arrow
|
||
|
||
Filter: 23%|██▎ | 10000/43598 [00:09<00:31, 1062.64 examples/s]
|
||
Filter: 23%|██▎ | 10000/43598 [00:09<00:32, 1047.14 examples/s]
|
||
Filter: 23%|██▎ | 10000/43598 [00:09<00:31, 1056.07 examples/s]
|
||
Filter: 23%|██▎ | 10000/43598 [00:14<00:49, 681.79 examples/s]
|
||
Filter: 23%|██▎ | 10000/43598 [00:14<00:47, 713.01 examples/s]
|
||
Filter: 46%|████▌ | 20000/43598 [00:19<00:22, 1057.79 examples/s]
|
||
Filter: 46%|████▌ | 20000/43598 [00:19<00:22, 1035.78 examples/s]
|
||
Filter: 46%|████▌ | 20000/43598 [00:18<00:21, 1075.08 examples/s]
|
||
Filter: 46%|████▌ | 20000/43598 [00:18<00:21, 1105.39 examples/s]
|
||
Filter: 46%|████▌ | 20000/43598 [00:18<00:21, 1083.82 examples/s]
|
||
Filter: 46%|████▌ | 20000/43598 [00:18<00:21, 1087.50 examples/s]
|
||
Filter: 69%|██████▉ | 30000/43598 [00:28<00:12, 1068.29 examples/s]
|
||
Filter: 69%|██████▉ | 30000/43598 [00:28<00:12, 1072.84 examples/s]
|
||
Filter: 23%|██▎ | 10000/43598 [00:26<00:47, 713.01 examples/s]
|
||
Filter: 69%|██████▉ | 30000/43598 [00:27<00:12, 1085.18 examples/s]
|
||
Filter: 46%|████▌ | 20000/43598 [00:28<00:33, 707.77 examples/s]
|
||
Filter: 69%|██████▉ | 30000/43598 [00:26<00:12, 1122.50 examples/s]
|
||
Filter: 69%|██████▉ | 30000/43598 [00:27<00:12, 1093.18 examples/s]
|
||
Filter: 69%|██████▉ | 30000/43598 [00:27<00:12, 1094.88 examples/s]
|
||
Filter: 46%|████▌ | 20000/43598 [00:27<00:32, 724.47 examples/s]
|
||
Filter: 92%|█████████▏| 40000/43598 [00:37<00:03, 1091.29 examples/s]
|
||
Filter: 92%|█████████▏| 40000/43598 [00:37<00:03, 1069.46 examples/s]
|
||
Filter: 92%|█████████▏| 40000/43598 [00:35<00:03, 1121.75 examples/s]
|
||
Filter: 92%|█████████▏| 40000/43598 [00:37<00:03, 1079.32 examples/s]
|
||
Filter: 92%|█████████▏| 40000/43598 [00:36<00:03, 1102.51 examples/s]
|
||
Filter: 92%|█████████▏| 40000/43598 [00:36<00:03, 1098.94 examples/s]
|
||
Filter: 46%|████▌ | 20000/43598 [00:38<00:33, 707.77 examples/s]
|
||
Filter: 100%|██████████| 43598/43598 [00:40<00:00, 1094.74 examples/s]
|
||
Filter: 100%|██████████| 43598/43598 [00:40<00:00, 1070.64 examples/s]
|
||
|
||
Filter: 0%| | 0/2339 [00:00<?, ? examples/s]
|
||
Filter: 100%|██████████| 43598/43598 [00:40<00:00, 1071.55 examples/s]
|
||
Filter: 100%|██████████| 43598/43598 [00:41<00:00, 1063.19 examples/s]
|
||
|
||
Filter: 0%| | 0/2339 [00:00<?, ? examples/s]
|
||
Filter: 100%|██████████| 43598/43598 [00:39<00:00, 1121.64 examples/s]
|
||
Filter: 100%|██████████| 43598/43598 [00:39<00:00, 1114.52 examples/s]
|
||
|
||
Filter: 0%| | 0/2339 [00:00<?, ? examples/s]
|
||
Filter: 69%|██████▉ | 30000/43598 [00:41<00:18, 743.78 examples/s]
|
||
Filter: 100%|██████████| 43598/43598 [00:40<00:00, 1080.61 examples/s]
|
||
Filter: 100%|██████████| 43598/43598 [00:40<00:00, 1076.80 examples/s]
|
||
|
||
Filter: 0%| | 0/2339 [00:00<?, ? examples/s]
|
||
Filter: 100%|██████████| 43598/43598 [00:39<00:00, 1104.79 examples/s]
|
||
Filter: 100%|██████████| 43598/43598 [00:39<00:00, 1095.02 examples/s]
|
||
|
||
Filter: 0%| | 0/2339 [00:00<?, ? examples/s]
|
||
Filter: 100%|██████████| 43598/43598 [00:39<00:00, 1099.79 examples/s]
|
||
Filter: 100%|██████████| 43598/43598 [00:39<00:00, 1093.16 examples/s]
|
||
|
||
Filter: 0%| | 0/2339 [00:00<?, ? examples/s]
|
||
Filter: 100%|██████████| 2339/2339 [00:02<00:00, 1109.46 examples/s]
|
||
Filter: 100%|██████████| 2339/2339 [00:02<00:00, 1105.29 examples/s]
|
||
|
||
Filter: 100%|██████████| 2339/2339 [00:02<00:00, 1112.85 examples/s]
|
||
Filter: 100%|██████████| 2339/2339 [00:02<00:00, 1105.95 examples/s]
|
||
Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-56ed0b6ecefba31f.arrow
|
||
2026-04-14 19:27:37 - INFO - datasets.arrow_dataset - Caching processed dataset at /scratch/qu.yang1/hf/datasets/Anthropic___hh-rlhf/default-cfba128a0ab1b99f/0.0.0/09be8c5bbc57cb3887f3a9732ad6aa7ec602a1fa/cache-56ed0b6ecefba31f.arrow
|
||
|
||
Filter: 100%|██████████| 2339/2339 [00:02<00:00, 1126.16 examples/s]
|
||
Filter: 100%|██████████| 2339/2339 [00:02<00:00, 1120.25 examples/s]
|
||
2026-04-14 19:27:37 - INFO - __main__ - Decontaminated 0 (0.00%) samples from the training set.
|
||
2026-04-14 19:27:37 - INFO - __main__ - Sample 41905 of the processed training set:
|
||
|
||
<|im_start|>system
|
||
<|im_end|>
|
||
<|im_start|>user
|
||
What is the best way to clean my refrigerator?<|im_end|>
|
||
<|im_start|>assistant
|
||
Well, I have a few ideas, but the best way to know what works best is to run some tests and then analyze the data to figure out which technique works best.<|im_end|>
|
||
<|im_start|>user
|
||
What are your ideas?<|im_end|>
|
||
<|im_start|>assistant
|
||
<think>
|
||
|
||
</think>
|
||
|
||
Some of my ideas include wiping down the interior with a rag, using bleach and water to wipe down the door, emptying the contents of the fridge to scrub the shelves with a clean cloth, cleaning the interior of the fridge with vinegar and water, and using a vacuum cleaner to clean the interior.<|im_end|>
|
||
|
||
2026-04-14 19:27:37 - INFO - __main__ - Sample 7296 of the processed training set:
|
||
|
||
<|im_start|>system
|
||
<|im_end|>
|
||
<|im_start|>user
|
||
How can I get blood out of cotton?<|im_end|>
|
||
<|im_start|>assistant
|
||
<think>
|
||
|
||
</think>
|
||
|
||
Blood? What do you mean?<|im_end|>
|
||
|
||
2026-04-14 19:27:37 - INFO - __main__ - Sample 1639 of the processed training set:
|
||
|
||
<|im_start|>system
|
||
<|im_end|>
|
||
<|im_start|>user
|
||
I'm trying to get my friend to open up<|im_end|>
|
||
<|im_start|>assistant
|
||
Sounds great! What have you tried so far?<|im_end|>
|
||
<|im_start|>user
|
||
I've tried to give my friend a safe space to open up about their feelings but they still aren't comfortable<|im_end|>
|
||
<|im_start|>assistant
|
||
<think>
|
||
|
||
</think>
|
||
|
||
It’s really important that people feel safe sharing their feelings, but I totally hear that this isn’t easy for them. And in my experience, that has a lot to do with them feeling that you can accept them and what they’re saying, even if they’re feeling difficult or uncomfortable feelings. Do you think you’ve done that, and that they still haven’t opened up?<|im_end|>
|
||
|
||
|
||
Filter: 69%|██████▉ | 30000/43598 [00:41<00:18, 735.95 examples/s]
|
||
Filter: 100%|██████████| 2339/2339 [00:02<00:00, 1110.24 examples/s]
|
||
Filter: 100%|██████████| 2339/2339 [00:02<00:00, 1096.85 examples/s]
|
||
|
||
Filter: 100%|██████████| 2339/2339 [00:02<00:00, 1093.09 examples/s]
|
||
Filter: 100%|██████████| 2339/2339 [00:02<00:00, 1088.24 examples/s]
|
||
|
||
Filter: 100%|██████████| 2339/2339 [00:02<00:00, 1086.56 examples/s]
|
||
Filter: 100%|██████████| 2339/2339 [00:02<00:00, 1081.76 examples/s]
|
||
|
||
Filter: 92%|█████████▏| 40000/43598 [00:51<00:04, 813.13 examples/s]
|
||
Filter: 92%|█████████▏| 40000/43598 [00:51<00:04, 800.78 examples/s]
|
||
Filter: 100%|██████████| 43598/43598 [00:55<00:00, 836.17 examples/s]
|
||
Filter: 100%|██████████| 43598/43598 [00:55<00:00, 787.85 examples/s]
|
||
|
||
Filter: 100%|██████████| 43598/43598 [00:55<00:00, 820.95 examples/s]
|
||
Filter: 100%|██████████| 43598/43598 [00:55<00:00, 782.73 examples/s]
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': model_init_kwargs, dataset_text_field, max_seq_length, packing. Will not be supported from version '1.0.0'.
|
||
|
||
Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
|
||
warnings.warn(message, FutureWarning)
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': model_init_kwargs, dataset_text_field, max_seq_length, packing. Will not be supported from version '1.0.0'.
|
||
|
||
Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
|
||
warnings.warn(message, FutureWarning)
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': model_init_kwargs, dataset_text_field, max_seq_length, packing. Will not be supported from version '1.0.0'.
|
||
|
||
Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
|
||
warnings.warn(message, FutureWarning)
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': model_init_kwargs, dataset_text_field, max_seq_length, packing. Will not be supported from version '1.0.0'.
|
||
|
||
Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
|
||
warnings.warn(message, FutureWarning)
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': model_init_kwargs, dataset_text_field, max_seq_length, packing. Will not be supported from version '1.0.0'.
|
||
|
||
Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
|
||
warnings.warn(message, FutureWarning)
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': model_init_kwargs, dataset_text_field, max_seq_length, packing. Will not be supported from version '1.0.0'.
|
||
|
||
Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
|
||
warnings.warn(message, FutureWarning)
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': model_init_kwargs, dataset_text_field, max_seq_length, packing. Will not be supported from version '1.0.0'.
|
||
|
||
Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
|
||
warnings.warn(message, FutureWarning)
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': model_init_kwargs, dataset_text_field, max_seq_length, packing. Will not be supported from version '1.0.0'.
|
||
|
||
Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
|
||
warnings.warn(message, FutureWarning)
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:158: UserWarning: You passed `model_init_kwargs` to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:185: UserWarning: You passed a model_id to the SFTTrainer. This will automatically create an `AutoModelForCausalLM` or a `PeftModel` (if you passed a `peft_config`) for you.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:158: UserWarning: You passed `model_init_kwargs` to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:158: UserWarning: You passed `model_init_kwargs` to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:158: UserWarning: You passed `model_init_kwargs` to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:158: UserWarning: You passed `model_init_kwargs` to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:158: UserWarning: You passed `model_init_kwargs` to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:185: UserWarning: You passed a model_id to the SFTTrainer. This will automatically create an `AutoModelForCausalLM` or a `PeftModel` (if you passed a `peft_config`) for you.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:158: UserWarning: You passed `model_init_kwargs` to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:185: UserWarning: You passed a model_id to the SFTTrainer. This will automatically create an `AutoModelForCausalLM` or a `PeftModel` (if you passed a `peft_config`) for you.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:185: UserWarning: You passed a model_id to the SFTTrainer. This will automatically create an `AutoModelForCausalLM` or a `PeftModel` (if you passed a `peft_config`) for you.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:185: UserWarning: You passed a model_id to the SFTTrainer. This will automatically create an `AutoModelForCausalLM` or a `PeftModel` (if you passed a `peft_config`) for you.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:185: UserWarning: You passed a model_id to the SFTTrainer. This will automatically create an `AutoModelForCausalLM` or a `PeftModel` (if you passed a `peft_config`) for you.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:185: UserWarning: You passed a model_id to the SFTTrainer. This will automatically create an `AutoModelForCausalLM` or a `PeftModel` (if you passed a `peft_config`) for you.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:158: UserWarning: You passed `model_init_kwargs` to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:185: UserWarning: You passed a model_id to the SFTTrainer. This will automatically create an `AutoModelForCausalLM` or a `PeftModel` (if you passed a `peft_config`) for you.
|
||
warnings.warn(
|
||
[INFO|configuration_utils.py:693] 2026-04-14 19:27:54,084 >> loading configuration file config.json from cache at /scratch/qu.yang1/hf/hub/models--Qwen--Qwen3-8B-Base/snapshots/49e3418fbbbca6ecbdf9608b4d22e5a407081db4/config.json
|
||
[INFO|configuration_utils.py:765] 2026-04-14 19:27:54,085 >> Model config Qwen3Config {
|
||
"architectures": [
|
||
"Qwen3ForCausalLM"
|
||
],
|
||
"attention_bias": false,
|
||
"attention_dropout": 0.0,
|
||
"bos_token_id": 151643,
|
||
"eos_token_id": 151643,
|
||
"head_dim": 128,
|
||
"hidden_act": "silu",
|
||
"hidden_size": 4096,
|
||
"initializer_range": 0.02,
|
||
"intermediate_size": 12288,
|
||
"max_position_embeddings": 32768,
|
||
"max_window_layers": 36,
|
||
"model_type": "qwen3",
|
||
"num_attention_heads": 32,
|
||
"num_hidden_layers": 36,
|
||
"num_key_value_heads": 8,
|
||
"rms_norm_eps": 1e-06,
|
||
"rope_scaling": null,
|
||
"rope_theta": 1000000,
|
||
"sliding_window": null,
|
||
"tie_word_embeddings": false,
|
||
"torch_dtype": "bfloat16",
|
||
"transformers_version": "4.51.0",
|
||
"use_cache": false,
|
||
"use_sliding_window": false,
|
||
"vocab_size": 151936
|
||
}
|
||
|
||
|
||
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]
|
||
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]
|
||
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]
|
||
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]
|
||
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s][INFO|modeling_utils.py:1124] 2026-04-14 19:27:54,515 >> loading weights file model.safetensors from cache at /scratch/qu.yang1/hf/hub/models--Qwen--Qwen3-8B-Base/snapshots/49e3418fbbbca6ecbdf9608b4d22e5a407081db4/model.safetensors.index.json
|
||
|
||
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||
2026-04-14 19:27:54 - WARNING - huggingface_hub.file_download - Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||
2026-04-14 19:27:54 - WARNING - huggingface_hub.file_download - Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||
2026-04-14 19:27:54 - WARNING - huggingface_hub.file_download - Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||
2026-04-14 19:27:54 - WARNING - huggingface_hub.file_download - Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||
2026-04-14 19:27:54 - WARNING - huggingface_hub.file_download - Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||
|
||
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]
|
||
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]
|
||
Fetching 5 files: 20%|██ | 1/5 [01:41<06:46, 101.65s/it]
|
||
Fetching 5 files: 100%|██████████| 5/5 [01:41<00:00, 20.34s/it]
|
||
|
||
Fetching 5 files: 20%|██ | 1/5 [01:41<06:46, 101.66s/it][WARNING|logging.py:328] 2026-04-14 19:29:36,129 >> You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
||
|
||
Fetching 5 files: 20%|██ | 1/5 [01:41<06:46, 101.75s/it]
|
||
Fetching 5 files: 20%|██ | 1/5 [01:41<06:46, 101.63s/it]
|
||
Fetching 5 files: 100%|██████████| 5/5 [01:41<00:00, 20.33s/it]
|
||
[INFO|modeling_utils.py:2167] 2026-04-14 19:29:36,215 >> Instantiating Qwen3ForCausalLM model under default dtype torch.bfloat16.
|
||
[WARNING|logging.py:328] 2026-04-14 19:29:36,218 >> You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
||
[INFO|configuration_utils.py:1142] 2026-04-14 19:29:36,219 >> Generate config GenerationConfig {
|
||
"bos_token_id": 151643,
|
||
"eos_token_id": 151643,
|
||
"use_cache": false
|
||
}
|
||
|
||
|
||
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
|
||
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
|
||
Fetching 5 files: 20%|██ | 1/5 [01:41<06:47, 101.84s/it]
|
||
Fetching 5 files: 100%|██████████| 5/5 [01:41<00:00, 20.37s/it]
|
||
|
||
Fetching 5 files: 40%|████ | 2/5 [01:41<02:05, 41.97s/it] [WARNING|logging.py:328] 2026-04-14 19:29:36,311 >> You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
||
|
||
Fetching 5 files: 20%|██ | 1/5 [01:41<06:46, 101.71s/it]
|
||
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
|
||
Fetching 5 files: 100%|██████████| 5/5 [01:41<00:00, 20.39s/it]
|
||
[WARNING|logging.py:328] 2026-04-14 19:29:36,381 >> You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
||
|
||
Fetching 5 files: 20%|██ | 1/5 [01:41<06:47, 101.95s/it]
|
||
Fetching 5 files: 100%|██████████| 5/5 [01:41<00:00, 20.39s/it]
|
||
[WARNING|logging.py:328] 2026-04-14 19:29:36,398 >> You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
||
|
||
Loading checkpoint shards: 20%|██ | 1/5 [00:00<00:00, 6.84it/s]
|
||
Fetching 5 files: 60%|██████ | 3/5 [01:41<00:52, 26.47s/it]
|
||
Fetching 5 files: 100%|██████████| 5/5 [01:41<00:00, 20.39s/it]
|
||
[WARNING|logging.py:328] 2026-04-14 19:29:36,434 >> You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
||
|
||
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
|
||
Fetching 5 files: 20%|██ | 1/5 [01:41<06:47, 101.92s/it]
|
||
Fetching 5 files: 100%|██████████| 5/5 [01:41<00:00, 20.38s/it]
|
||
|
||
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s][WARNING|logging.py:328] 2026-04-14 19:29:36,462 >> You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
||
|
||
Fetching 5 files: 60%|██████ | 3/5 [01:41<00:52, 26.42s/it]
|
||
Fetching 5 files: 100%|██████████| 5/5 [01:41<00:00, 20.37s/it]
|
||
[WARNING|logging.py:328] 2026-04-14 19:29:36,498 >> You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
|
||
|
||
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
|
||
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
|
||
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
|
||
Loading checkpoint shards: 20%|██ | 1/5 [00:00<00:01, 3.07it/s]
|
||
Loading checkpoint shards: 40%|████ | 2/5 [00:00<00:00, 17.42it/s]
|
||
Loading checkpoint shards: 40%|████ | 2/5 [00:00<00:00, 11.11it/s]
|
||
Loading checkpoint shards: 40%|████ | 2/5 [00:00<00:00, 10.69it/s]
|
||
Loading checkpoint shards: 40%|████ | 2/5 [00:00<00:00, 5.46it/s]
|
||
Loading checkpoint shards: 40%|████ | 2/5 [00:00<00:00, 7.45it/s]
|
||
Loading checkpoint shards: 40%|████ | 2/5 [00:00<00:00, 4.20it/s]
|
||
Loading checkpoint shards: 60%|██████ | 3/5 [00:00<00:00, 5.70it/s]
|
||
Loading checkpoint shards: 60%|██████ | 3/5 [00:00<00:00, 11.32it/s]
|
||
Loading checkpoint shards: 60%|██████ | 3/5 [00:00<00:00, 6.79it/s]
|
||
Loading checkpoint shards: 60%|██████ | 3/5 [00:00<00:00, 12.09it/s]
|
||
Loading checkpoint shards: 60%|██████ | 3/5 [00:00<00:00, 4.88it/s]
|
||
Loading checkpoint shards: 80%|████████ | 4/5 [00:00<00:00, 6.28it/s]
|
||
Loading checkpoint shards: 80%|████████ | 4/5 [00:00<00:00, 7.88it/s]
|
||
Loading checkpoint shards: 80%|████████ | 4/5 [00:00<00:00, 7.97it/s]
|
||
Loading checkpoint shards: 80%|████████ | 4/5 [00:00<00:00, 8.91it/s]
|
||
Loading checkpoint shards: 80%|████████ | 4/5 [00:00<00:00, 7.02it/s]
|
||
Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00, 8.02it/s]
|
||
|
||
Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00, 9.20it/s]
|
||
|
||
Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00, 9.32it/s]
|
||
|
||
Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00, 10.60it/s]
|
||
|
||
Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00, 11.23it/s]
|
||
Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00, 7.00it/s]
|
||
|
||
Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00, 11.37it/s]
|
||
|
||
Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00, 10.87it/s]
|
||
Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00, 10.94it/s]
|
||
|
||
Loading checkpoint shards: 80%|████████ | 4/5 [00:00<00:00, 6.04it/s]
|
||
Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00, 6.45it/s]
|
||
[INFO|modeling_utils.py:4926] 2026-04-14 19:29:37,034 >> All model checkpoint weights were used when initializing Qwen3ForCausalLM.
|
||
|
||
[INFO|modeling_utils.py:4934] 2026-04-14 19:29:37,034 >> All the weights of Qwen3ForCausalLM were initialized from the model checkpoint at Qwen/Qwen3-8B-Base.
|
||
If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen3ForCausalLM for predictions without further training.
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:195: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:283: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:321: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:195: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:283: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:321: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:195: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:283: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:321: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:195: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:283: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:321: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:195: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:283: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:321: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
[INFO|configuration_utils.py:1097] 2026-04-14 19:29:37,289 >> loading configuration file generation_config.json from cache at /scratch/qu.yang1/hf/hub/models--Qwen--Qwen3-8B-Base/snapshots/49e3418fbbbca6ecbdf9608b4d22e5a407081db4/generation_config.json
|
||
[INFO|configuration_utils.py:1142] 2026-04-14 19:29:37,290 >> Generate config GenerationConfig {
|
||
"bos_token_id": 151643,
|
||
"eos_token_id": 151643,
|
||
"max_new_tokens": 2048
|
||
}
|
||
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:195: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:283: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:321: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:195: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:283: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:321: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:195: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:283: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:321: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
|
||
warnings.warn(
|
||
Using custom data configuration default-4e6a8a2c525b008a
|
||
2026-04-14 19:29:37 - INFO - datasets.builder - Using custom data configuration default-4e6a8a2c525b008a
|
||
Loading Dataset Infos from /home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/datasets/packaged_modules/generator
|
||
2026-04-14 19:29:37 - INFO - datasets.info - Loading Dataset Infos from /home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/datasets/packaged_modules/generator
|
||
Generating dataset generator (/scratch/qu.yang1/hf/datasets/generator/default-4e6a8a2c525b008a/0.0.0)
|
||
2026-04-14 19:29:37 - INFO - datasets.builder - Generating dataset generator (/scratch/qu.yang1/hf/datasets/generator/default-4e6a8a2c525b008a/0.0.0)
|
||
Downloading and preparing dataset generator/default to /scratch/qu.yang1/hf/datasets/generator/default-4e6a8a2c525b008a/0.0.0...
|
||
2026-04-14 19:29:37 - INFO - datasets.builder - Downloading and preparing dataset generator/default to /scratch/qu.yang1/hf/datasets/generator/default-4e6a8a2c525b008a/0.0.0...
|
||
Generating train split
|
||
2026-04-14 19:29:37 - INFO - datasets.builder - Generating train split
|
||
|
||
Generating train split: 0 examples [00:00, ? examples/s]
|
||
Generating train split: 1 examples [00:00, 1.42 examples/s]
|
||
Generating train split: 804 examples [00:01, 575.81 examples/s]
|
||
Generating train split: 1610 examples [00:02, 785.39 examples/s]
|
||
Generating train split: 2413 examples [00:03, 887.58 examples/s]
|
||
Generating train split: 3217 examples [00:03, 947.79 examples/s]
|
||
Generating train split: 4020 examples [00:04, 984.69 examples/s]
|
||
Generating train split: 4824 examples [00:05, 1015.96 examples/s]
|
||
Generating train split: 5627 examples [00:06, 1027.25 examples/s]
|
||
Generating train split: 6432 examples [00:06, 1032.33 examples/s]
|
||
Generating train split: 7000 examples [00:07, 1246.40 examples/s]
|
||
Generating train split: 7234 examples [00:07, 919.64 examples/s]
|
||
Generating train split: 8038 examples [00:08, 960.61 examples/s]
|
||
Generating train split: 8843 examples [00:09, 1002.69 examples/s]
|
||
Generating train split: 9650 examples [00:10, 1019.41 examples/s]
|
||
Generating train split: 10455 examples [00:10, 1027.62 examples/s]
|
||
Generating train split: 11259 examples [00:11, 1025.73 examples/s]
|
||
Generating train split: 12063 examples [00:12, 1032.88 examples/s]
|
||
Generating train split: 12867 examples [00:13, 986.40 examples/s]
|
||
Generating train split: 13668 examples [00:14, 1004.89 examples/s]
|
||
Generating train split: 14473 examples [00:14, 1018.48 examples/s]
|
||
Generating train split: 15278 examples [00:15, 1025.85 examples/s]
|
||
Generating train split: 16082 examples [00:16, 1031.54 examples/s]
|
||
Generating train split: 16888 examples [00:16, 1258.68 examples/s]
|
||
Generating train split: 17195 examples [00:16, 1015.26 examples/s]
|
||
Unable to verify splits sizes.
|
||
2026-04-14 19:29:54 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.
|
||
Dataset generator downloaded and prepared to /scratch/qu.yang1/hf/datasets/generator/default-4e6a8a2c525b008a/0.0.0. Subsequent calls will reuse this data.
|
||
2026-04-14 19:29:54 - INFO - datasets.builder - Dataset generator downloaded and prepared to /scratch/qu.yang1/hf/datasets/generator/default-4e6a8a2c525b008a/0.0.0. Subsequent calls will reuse this data.
|
||
Using custom data configuration default-281ae67d28ccbe3f
|
||
2026-04-14 19:29:54 - INFO - datasets.builder - Using custom data configuration default-281ae67d28ccbe3f
|
||
Loading Dataset Infos from /home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/datasets/packaged_modules/generator
|
||
2026-04-14 19:29:54 - INFO - datasets.info - Loading Dataset Infos from /home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/datasets/packaged_modules/generator
|
||
Generating dataset generator (/scratch/qu.yang1/hf/datasets/generator/default-281ae67d28ccbe3f/0.0.0)
|
||
2026-04-14 19:29:54 - INFO - datasets.builder - Generating dataset generator (/scratch/qu.yang1/hf/datasets/generator/default-281ae67d28ccbe3f/0.0.0)
|
||
Downloading and preparing dataset generator/default to /scratch/qu.yang1/hf/datasets/generator/default-281ae67d28ccbe3f/0.0.0...
|
||
2026-04-14 19:29:54 - INFO - datasets.builder - Downloading and preparing dataset generator/default to /scratch/qu.yang1/hf/datasets/generator/default-281ae67d28ccbe3f/0.0.0...
|
||
Generating train split
|
||
2026-04-14 19:29:54 - INFO - datasets.builder - Generating train split
|
||
|
||
Generating train split: 0 examples [00:00, ? examples/s]
|
||
Generating train split: 1 examples [00:01, 1.05s/ examples]
|
||
Generating train split: 807 examples [00:01, 845.24 examples/s]
|
||
Generating train split: 931 examples [00:01, 707.59 examples/s]
|
||
Unable to verify splits sizes.
|
||
2026-04-14 19:29:56 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.
|
||
Dataset generator downloaded and prepared to /scratch/qu.yang1/hf/datasets/generator/default-281ae67d28ccbe3f/0.0.0. Subsequent calls will reuse this data.
|
||
2026-04-14 19:29:56 - INFO - datasets.builder - Dataset generator downloaded and prepared to /scratch/qu.yang1/hf/datasets/generator/default-281ae67d28ccbe3f/0.0.0. Subsequent calls will reuse this data.
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:412: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `SFTTrainer.__init__`. Use `processing_class` instead.
|
||
super().__init__(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:412: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `SFTTrainer.__init__`. Use `processing_class` instead.
|
||
super().__init__(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:412: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `SFTTrainer.__init__`. Use `processing_class` instead.
|
||
super().__init__(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:412: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `SFTTrainer.__init__`. Use `processing_class` instead.
|
||
super().__init__(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:412: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `SFTTrainer.__init__`. Use `processing_class` instead.
|
||
super().__init__(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:412: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `SFTTrainer.__init__`. Use `processing_class` instead.
|
||
super().__init__(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:412: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `SFTTrainer.__init__`. Use `processing_class` instead.
|
||
super().__init__(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/trl/trainer/sft_trainer.py:412: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `SFTTrainer.__init__`. Use `processing_class` instead.
|
||
super().__init__(
|
||
[INFO|trainer.py:748] 2026-04-14 19:29:58,464 >> Using auto half precision backend
|
||
2026-04-14 19:29:58 - INFO - __main__ - *** Train ***
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/accelerate/accelerator.py:1557: UserWarning: Upcasted low precision parameters in Qwen3ForCausalLM because mixed precision turned on in FSDP. Affects: model.embed_tokens.weight, model.norm.weight, lm_head.weight.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/accelerate/accelerator.py:1557: UserWarning: Upcasted low precision parameters in Qwen3DecoderLayer because mixed precision turned on in FSDP. Affects: self_attn.q_proj.weight, self_attn.k_proj.weight, self_attn.v_proj.weight, self_attn.o_proj.weight, self_attn.q_norm.weight, self_attn.k_norm.weight, mlp.gate_proj.weight, mlp.up_proj.weight, mlp.down_proj.weight, input_layernorm.weight, post_attention_layernorm.weight.
|
||
warnings.warn(
|
||
/home/qu.yang1/.conda/envs/dpo_v4/lib/python3.11/site-packages/accelerate/accelerator.py:1563: UserWarning: FSDP upcast of low precision parameters may affect the precision of model checkpoints.
|
||
warnings.warn(
|
||
[INFO|trainer.py:2414] 2026-04-14 19:30:35,921 >> ***** Running training *****
|
||
[INFO|trainer.py:2415] 2026-04-14 19:30:35,921 >> Num examples = 17,195
|
||
[INFO|trainer.py:2416] 2026-04-14 19:30:35,922 >> Num Epochs = 1
|
||
[INFO|trainer.py:2417] 2026-04-14 19:30:35,922 >> Instantaneous batch size per device = 16
|
||
[INFO|trainer.py:2420] 2026-04-14 19:30:35,922 >> Total train batch size (w. parallel, distributed & accumulation) = 128
|
||
[INFO|trainer.py:2421] 2026-04-14 19:30:35,922 >> Gradient Accumulation steps = 1
|
||
[INFO|trainer.py:2422] 2026-04-14 19:30:35,922 >> Total optimization steps = 135
|
||
[INFO|trainer.py:2423] 2026-04-14 19:30:35,923 >> Number of trainable parameters = 1,023,841,920
|
||
[INFO|integration_utils.py:831] 2026-04-14 19:30:35,925 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
|
||
wandb: Currently logged in as: feng-cheng (feng-cheng-northeastern-university). Use `wandb login --relogin` to force relogin
|
||
wandb: wandb version 0.26.0 is available! To upgrade, please run:
|
||
wandb: $ pip install wandb --upgrade
|
||
wandb: Tracking run with wandb version 0.17.5
|
||
wandb: Run data is saved locally in /scratch/qu.yang1/wandb/wandb/run-20260414_193038-mlwlhzba
|
||
wandb: Run `wandb offline` to turn off syncing.
|
||
wandb: Syncing run qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981
|
||
wandb: ⭐️ View project at https://wandb.ai/feng-cheng-northeastern-university/huggingface
|
||
wandb: 🚀 View run at https://wandb.ai/feng-cheng-northeastern-university/huggingface/runs/mlwlhzba
|
||
|
||
0%| | 0/135 [00:00<?, ?it/s]
|
||
1%| | 1/135 [00:02<05:11, 2.33s/it]
|
||
|
||
{'loss': 3.1025, 'grad_norm': 15.093496322631836, 'learning_rate': 0.0, 'epoch': 0.01}
|
||
|
||
1%| | 1/135 [00:02<05:11, 2.33s/it]
|
||
1%|▏ | 2/135 [00:03<03:49, 1.73s/it]
|
||
2%|▏ | 3/135 [00:04<03:22, 1.54s/it]
|
||
3%|▎ | 4/135 [00:06<03:09, 1.44s/it]
|
||
4%|▎ | 5/135 [00:07<03:01, 1.39s/it]
|
||
4%|▍ | 6/135 [00:08<02:55, 1.36s/it]
|
||
5%|▌ | 7/135 [00:10<02:51, 1.34s/it]
|
||
6%|▌ | 8/135 [00:11<02:48, 1.32s/it]
|
||
7%|▋ | 9/135 [00:12<02:45, 1.32s/it]
|
||
7%|▋ | 10/135 [00:14<02:43, 1.31s/it]
|
||
|
||
{'loss': 2.833, 'grad_norm': 5.202989101409912, 'learning_rate': 1.2857142857142859e-05, 'epoch': 0.07}
|
||
|
||
7%|▋ | 10/135 [00:14<02:43, 1.31s/it]
|
||
8%|▊ | 11/135 [00:15<02:42, 1.31s/it]
|
||
9%|▉ | 12/135 [00:16<02:40, 1.30s/it]
|
||
10%|▉ | 13/135 [00:17<02:38, 1.30s/it]
|
||
10%|█ | 14/135 [00:19<02:37, 1.30s/it]
|
||
11%|█ | 15/135 [00:20<02:35, 1.30s/it]
|
||
12%|█▏ | 16/135 [00:22<02:46, 1.40s/it]
|
||
13%|█▎ | 17/135 [00:23<02:47, 1.42s/it]
|
||
13%|█▎ | 18/135 [00:24<02:41, 1.38s/it]
|
||
14%|█▍ | 19/135 [00:26<02:37, 1.36s/it]
|
||
15%|█▍ | 20/135 [00:27<02:34, 1.34s/it]
|
||
|
||
{'loss': 2.2228, 'grad_norm': 7.5872626304626465, 'learning_rate': 1.9915854864676665e-05, 'epoch': 0.15}
|
||
|
||
15%|█▍ | 20/135 [00:27<02:34, 1.34s/it]
|
||
16%|█▌ | 21/135 [00:28<02:31, 1.33s/it]
|
||
16%|█▋ | 22/135 [00:30<02:29, 1.32s/it]
|
||
17%|█▋ | 23/135 [00:31<02:27, 1.31s/it]
|
||
18%|█▊ | 24/135 [00:32<02:25, 1.31s/it]
|
||
19%|█▊ | 25/135 [00:33<02:23, 1.30s/it]
|
||
19%|█▉ | 26/135 [00:35<02:21, 1.30s/it]
|
||
20%|██ | 27/135 [00:36<02:20, 1.30s/it]
|
||
21%|██ | 28/135 [00:37<02:19, 1.30s/it]
|
||
21%|██▏ | 29/135 [00:39<02:17, 1.30s/it]
|
||
22%|██▏ | 30/135 [00:40<02:16, 1.30s/it]
|
||
|
||
{'loss': 1.9658, 'grad_norm': 7.160677433013916, 'learning_rate': 1.9251166435386837e-05, 'epoch': 0.22}
|
||
|
||
22%|██▏ | 30/135 [00:40<02:16, 1.30s/it]
|
||
23%|██▎ | 31/135 [00:41<02:15, 1.30s/it]
|
||
24%|██▎ | 32/135 [00:43<02:14, 1.30s/it]
|
||
24%|██▍ | 33/135 [00:44<02:18, 1.36s/it]
|
||
25%|██▌ | 34/135 [00:45<02:15, 1.34s/it]
|
||
26%|██▌ | 35/135 [00:47<02:18, 1.39s/it]
|
||
27%|██▋ | 36/135 [00:48<02:14, 1.36s/it]
|
||
27%|██▋ | 37/135 [00:49<02:11, 1.34s/it]
|
||
28%|██▊ | 38/135 [00:51<02:09, 1.33s/it]
|
||
29%|██▉ | 39/135 [00:52<02:06, 1.32s/it]
|
||
30%|██▉ | 40/135 [00:53<02:05, 1.32s/it]
|
||
|
||
{'loss': 1.7456, 'grad_norm': 4.033580780029297, 'learning_rate': 1.796634556457236e-05, 'epoch': 0.3}
|
||
|
||
30%|██▉ | 40/135 [00:53<02:05, 1.32s/it]
|
||
30%|███ | 41/135 [00:55<02:03, 1.31s/it]
|
||
31%|███ | 42/135 [00:56<02:02, 1.31s/it]
|
||
32%|███▏ | 43/135 [00:57<02:00, 1.31s/it]
|
||
33%|███▎ | 44/135 [00:59<01:59, 1.31s/it]
|
||
33%|███▎ | 45/135 [01:00<01:57, 1.31s/it]
|
||
34%|███▍ | 46/135 [01:01<01:56, 1.31s/it]
|
||
35%|███▍ | 47/135 [01:03<01:54, 1.31s/it]
|
||
36%|███▌ | 48/135 [01:04<01:53, 1.31s/it]
|
||
36%|███▋ | 49/135 [01:05<01:52, 1.31s/it]
|
||
37%|███▋ | 50/135 [01:07<01:54, 1.35s/it]
|
||
|
||
{'loss': 1.6527, 'grad_norm': 1.7138469219207764, 'learning_rate': 1.614751751104301e-05, 'epoch': 0.37}
|
||
|
||
37%|███▋ | 50/135 [01:07<01:54, 1.35s/it]
|
||
38%|███▊ | 51/135 [01:08<01:55, 1.37s/it]
|
||
39%|███▊ | 52/135 [01:09<01:52, 1.35s/it]
|
||
39%|███▉ | 53/135 [01:11<01:54, 1.39s/it]
|
||
40%|████ | 54/135 [01:12<01:50, 1.37s/it]
|
||
41%|████ | 55/135 [01:13<01:47, 1.35s/it]
|
||
41%|████▏ | 56/135 [01:15<01:45, 1.33s/it]
|
||
42%|████▏ | 57/135 [01:16<01:43, 1.33s/it]
|
||
43%|████▎ | 58/135 [01:17<01:41, 1.32s/it]
|
||
44%|████▎ | 59/135 [01:19<01:40, 1.32s/it]
|
||
44%|████▍ | 60/135 [01:20<01:38, 1.31s/it]
|
||
|
||
{'loss': 1.6036, 'grad_norm': 2.0834147930145264, 'learning_rate': 1.3916603579471705e-05, 'epoch': 0.44}
|
||
|
||
44%|████▍ | 60/135 [01:20<01:38, 1.31s/it]
|
||
45%|████▌ | 61/135 [01:21<01:37, 1.31s/it]
|
||
46%|████▌ | 62/135 [01:23<01:35, 1.31s/it]
|
||
47%|████▋ | 63/135 [01:24<01:34, 1.31s/it]
|
||
47%|████▋ | 64/135 [01:25<01:32, 1.31s/it]
|
||
48%|████▊ | 65/135 [01:26<01:31, 1.31s/it]
|
||
49%|████▉ | 66/135 [01:28<01:30, 1.31s/it]
|
||
50%|████▉ | 67/135 [01:29<01:31, 1.35s/it]
|
||
50%|█████ | 68/135 [01:31<01:29, 1.34s/it]
|
||
51%|█████ | 69/135 [01:32<01:30, 1.37s/it]
|
||
52%|█████▏ | 70/135 [01:33<01:27, 1.35s/it]
|
||
|
||
{'loss': 1.559, 'grad_norm': 1.5917985439300537, 'learning_rate': 1.1423148382732854e-05, 'epoch': 0.52}
|
||
|
||
52%|█████▏ | 70/135 [01:33<01:27, 1.35s/it]
|
||
53%|█████▎ | 71/135 [01:35<01:25, 1.33s/it]
|
||
53%|█████▎ | 72/135 [01:36<01:26, 1.38s/it]
|
||
54%|█████▍ | 73/135 [01:37<01:23, 1.35s/it]
|
||
55%|█████▍ | 74/135 [01:39<01:21, 1.34s/it]
|
||
56%|█████▌ | 75/135 [01:40<01:19, 1.33s/it]
|
||
56%|█████▋ | 76/135 [01:41<01:17, 1.32s/it]
|
||
57%|█████▋ | 77/135 [01:43<01:16, 1.31s/it]
|
||
58%|█████▊ | 78/135 [01:44<01:14, 1.31s/it]
|
||
59%|█████▊ | 79/135 [01:45<01:13, 1.31s/it]
|
||
59%|█████▉ | 80/135 [01:46<01:11, 1.31s/it]
|
||
|
||
{'loss': 1.5159, 'grad_norm': 1.3988016843795776, 'learning_rate': 8.83429543400241e-06, 'epoch': 0.59}
|
||
|
||
59%|█████▉ | 80/135 [01:47<01:11, 1.31s/it]
|
||
60%|██████ | 81/135 [01:48<01:10, 1.31s/it]
|
||
61%|██████ | 82/135 [01:49<01:09, 1.31s/it]
|
||
61%|██████▏ | 83/135 [01:50<01:07, 1.31s/it]
|
||
62%|██████▏ | 84/135 [01:52<01:08, 1.34s/it]
|
||
63%|██████▎ | 85/135 [01:53<01:06, 1.33s/it]
|
||
64%|██████▎ | 86/135 [01:54<01:04, 1.32s/it]
|
||
64%|██████▍ | 87/135 [01:56<01:05, 1.36s/it]
|
||
65%|██████▌ | 88/135 [01:57<01:05, 1.39s/it]
|
||
66%|██████▌ | 89/135 [01:59<01:02, 1.37s/it]
|
||
67%|██████▋ | 90/135 [02:00<01:00, 1.35s/it]
|
||
|
||
{'loss': 1.504, 'grad_norm': 1.077965497970581, 'learning_rate': 6.323583033672799e-06, 'epoch': 0.67}
|
||
|
||
67%|██████▋ | 90/135 [02:00<01:00, 1.35s/it]
|
||
67%|██████▋ | 91/135 [02:01<00:58, 1.33s/it]
|
||
68%|██████▊ | 92/135 [02:03<00:56, 1.33s/it]
|
||
69%|██████▉ | 93/135 [02:04<00:55, 1.32s/it]
|
||
70%|██████▉ | 94/135 [02:05<00:53, 1.31s/it]
|
||
70%|███████ | 95/135 [02:06<00:52, 1.31s/it]
|
||
71%|███████ | 96/135 [02:08<00:51, 1.31s/it]
|
||
72%|███████▏ | 97/135 [02:09<00:49, 1.31s/it]
|
||
73%|███████▎ | 98/135 [02:10<00:48, 1.31s/it]
|
||
73%|███████▎ | 99/135 [02:12<00:46, 1.31s/it]
|
||
74%|███████▍ | 100/135 [02:13<00:45, 1.31s/it]
|
||
|
||
{'loss': 1.482, 'grad_norm': 1.0065542459487915, 'learning_rate': 4.059311495186338e-06, 'epoch': 0.74}
|
||
|
||
74%|███████▍ | 100/135 [02:13<00:45, 1.31s/it][INFO|trainer.py:4307] 2026-04-14 19:32:56,372 >>
|
||
***** Running Evaluation *****
|
||
[INFO|trainer.py:4309] 2026-04-14 19:32:56,372 >> Num examples = 931
|
||
[INFO|trainer.py:4312] 2026-04-14 19:32:56,372 >> Batch size = 16
|
||
|
||
|
||
0%| | 0/8 [00:00<?, ?it/s][A
|
||
|
||
25%|██▌ | 2/8 [00:00<00:01, 5.98it/s][A
|
||
|
||
38%|███▊ | 3/8 [00:00<00:01, 4.13it/s][A
|
||
|
||
50%|█████ | 4/8 [00:01<00:01, 3.58it/s][A
|
||
|
||
62%|██████▎ | 5/8 [00:01<00:00, 3.34it/s][A
|
||
|
||
75%|███████▌ | 6/8 [00:01<00:00, 3.18it/s][A
|
||
|
||
88%|████████▊ | 7/8 [00:02<00:00, 3.10it/s][A
|
||
|
||
100%|██████████| 8/8 [00:02<00:00, 3.07it/s][A
|
||
|
||
|
||
|
||
[A{'eval_loss': 1.4770615100860596, 'eval_runtime': 2.7359, 'eval_samples_per_second': 340.288, 'eval_steps_per_second': 2.924, 'epoch': 0.74}
|
||
|
||
74%|███████▍ | 100/135 [02:16<00:45, 1.31s/it]
|
||
|
||
100%|██████████| 8/8 [00:02<00:00, 3.07it/s][A
|
||
|
||
[A
|
||
75%|███████▍ | 101/135 [02:17<01:13, 2.16s/it]
|
||
76%|███████▌ | 102/135 [02:18<01:02, 1.90s/it]
|
||
76%|███████▋ | 103/135 [02:20<00:55, 1.72s/it]
|
||
77%|███████▋ | 104/135 [02:21<00:49, 1.60s/it]
|
||
78%|███████▊ | 105/135 [02:22<00:46, 1.55s/it]
|
||
79%|███████▊ | 106/135 [02:24<00:44, 1.52s/it]
|
||
79%|███████▉ | 107/135 [02:25<00:40, 1.46s/it]
|
||
80%|████████ | 108/135 [02:27<00:38, 1.41s/it]
|
||
81%|████████ | 109/135 [02:28<00:35, 1.38s/it]
|
||
81%|████████▏ | 110/135 [02:29<00:33, 1.36s/it]
|
||
|
||
{'loss': 1.4691, 'grad_norm': 0.8746654987335205, 'learning_rate': 2.1932614882827196e-06, 'epoch': 0.81}
|
||
|
||
81%|████████▏ | 110/135 [02:29<00:33, 1.36s/it]
|
||
82%|████████▏ | 111/135 [02:30<00:32, 1.34s/it]
|
||
83%|████████▎ | 112/135 [02:32<00:30, 1.33s/it]
|
||
84%|████████▎ | 113/135 [02:33<00:29, 1.32s/it]
|
||
84%|████████▍ | 114/135 [02:34<00:27, 1.31s/it]
|
||
85%|████████▌ | 115/135 [02:36<00:26, 1.31s/it]
|
||
86%|████████▌ | 116/135 [02:37<00:24, 1.31s/it]
|
||
87%|████████▋ | 117/135 [02:38<00:23, 1.31s/it]
|
||
87%|████████▋ | 118/135 [02:40<00:22, 1.33s/it]
|
||
88%|████████▊ | 119/135 [02:41<00:21, 1.32s/it]
|
||
89%|████████▉ | 120/135 [02:42<00:19, 1.32s/it]
|
||
|
||
{'loss': 1.4602, 'grad_norm': 0.8962180614471436, 'learning_rate': 8.505197417404687e-07, 'epoch': 0.89}
|
||
|
||
89%|████████▉ | 120/135 [02:42<00:19, 1.32s/it]
|
||
90%|████████▉ | 121/135 [02:44<00:18, 1.31s/it]
|
||
90%|█████████ | 122/135 [02:45<00:16, 1.31s/it]
|
||
91%|█████████ | 123/135 [02:46<00:16, 1.35s/it]
|
||
92%|█████████▏| 124/135 [02:48<00:15, 1.38s/it]
|
||
93%|█████████▎| 125/135 [02:49<00:13, 1.36s/it]
|
||
93%|█████████▎| 126/135 [02:50<00:12, 1.34s/it]
|
||
94%|█████████▍| 127/135 [02:52<00:10, 1.33s/it]
|
||
95%|█████████▍| 128/135 [02:53<00:09, 1.32s/it]
|
||
96%|█████████▌| 129/135 [02:54<00:07, 1.31s/it]
|
||
96%|█████████▋| 130/135 [02:56<00:06, 1.31s/it]
|
||
|
||
{'loss': 1.4526, 'grad_norm': 0.8950978517532349, 'learning_rate': 1.2109411818274851e-07, 'epoch': 0.96}
|
||
|
||
96%|█████████▋| 130/135 [02:56<00:06, 1.31s/it]
|
||
97%|█████████▋| 131/135 [02:57<00:05, 1.31s/it]
|
||
98%|█████████▊| 132/135 [02:58<00:03, 1.30s/it]
|
||
99%|█████████▊| 133/135 [02:59<00:02, 1.30s/it]
|
||
99%|█████████▉| 134/135 [03:01<00:01, 1.30s/it]
|
||
100%|██████████| 135/135 [03:02<00:00, 1.35s/it][INFO|trainer.py:3984] 2026-04-14 19:34:05,342 >> Saving model checkpoint to /scratch/qu.yang1/outputs/qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981/checkpoint-135
|
||
[INFO|configuration_utils.py:419] 2026-04-14 19:34:05,349 >> Configuration saved in /scratch/qu.yang1/outputs/qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981/checkpoint-135/config.json
|
||
[INFO|configuration_utils.py:911] 2026-04-14 19:34:05,353 >> Configuration saved in /scratch/qu.yang1/outputs/qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981/checkpoint-135/generation_config.json
|
||
[INFO|modeling_utils.py:3580] 2026-04-14 19:34:58,174 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 6 checkpoint shards. You can find where each parameters has been saved in the index located at /scratch/qu.yang1/outputs/qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981/checkpoint-135/model.safetensors.index.json.
|
||
[INFO|tokenization_utils_base.py:2510] 2026-04-14 19:34:58,183 >> tokenizer config file saved in /scratch/qu.yang1/outputs/qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981/checkpoint-135/tokenizer_config.json
|
||
[INFO|tokenization_utils_base.py:2519] 2026-04-14 19:34:58,190 >> Special tokens file saved in /scratch/qu.yang1/outputs/qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981/checkpoint-135/special_tokens_map.json
|
||
[INFO|trainer.py:2681] 2026-04-14 19:38:57,475 >>
|
||
|
||
Training completed. Do not forget to share your model on huggingface.co/models =)
|
||
|
||
|
||
|
||
|
||
{'train_runtime': 501.5523, 'train_samples_per_second': 34.284, 'train_steps_per_second': 0.269, 'train_loss': 1.7195094338169805, 'epoch': 1.0}
|
||
|
||
100%|██████████| 135/135 [08:14<00:00, 1.35s/it]
|
||
100%|██████████| 135/135 [08:14<00:00, 3.66s/it]
|
||
***** train metrics *****
|
||
epoch = 1.0
|
||
total_flos = 46771303GF
|
||
train_loss = 1.7195
|
||
train_runtime = 0:08:21.55
|
||
train_samples = 43598
|
||
train_samples_per_second = 34.284
|
||
train_steps_per_second = 0.269
|
||
2026-04-14 19:38:57 - INFO - __main__ - *** Save model ***
|
||
[INFO|configuration_utils.py:419] 2026-04-14 19:39:19,837 >> Configuration saved in /scratch/qu.yang1/outputs/qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981/config.json
|
||
[INFO|configuration_utils.py:911] 2026-04-14 19:39:19,841 >> Configuration saved in /scratch/qu.yang1/outputs/qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981/generation_config.json
|
||
[INFO|modeling_utils.py:3580] 2026-04-14 19:40:19,850 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 7 checkpoint shards. You can find where each parameters has been saved in the index located at /scratch/qu.yang1/outputs/qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981/model.safetensors.index.json.
|
||
[INFO|tokenization_utils_base.py:2510] 2026-04-14 19:40:19,858 >> tokenizer config file saved in /scratch/qu.yang1/outputs/qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981/tokenizer_config.json
|
||
[INFO|tokenization_utils_base.py:2519] 2026-04-14 19:40:19,867 >> Special tokens file saved in /scratch/qu.yang1/outputs/qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981/special_tokens_map.json
|
||
2026-04-14 19:40:20 - INFO - __main__ - Saved HF-compatible model artifacts to /scratch/qu.yang1/outputs/qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981
|
||
2026-04-14 19:40:20 - INFO - __main__ - Saved validated HF-compatible model artifacts to /scratch/qu.yang1/outputs/qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981
|
||
[INFO|modelcard.py:450] 2026-04-14 19:40:20,569 >> Dropping the following result as it does not have all the necessary fields:
|
||
{'dataset': {'name': 'Anthropic/hh-rlhf', 'type': 'Anthropic/hh-rlhf', 'config': 'default', 'split': 'train', 'args': 'default'}}
|
||
[INFO|configuration_utils.py:419] 2026-04-14 19:40:20,592 >> Configuration saved in /scratch/qu.yang1/outputs/qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981/config.json
|
||
2026-04-14 19:40:20 - INFO - __main__ - *** Evaluate ***
|
||
[INFO|trainer.py:4307] 2026-04-14 19:40:20,597 >>
|
||
***** Running Evaluation *****
|
||
[INFO|trainer.py:4309] 2026-04-14 19:40:20,597 >> Num examples = 931
|
||
[INFO|trainer.py:4312] 2026-04-14 19:40:20,597 >> Batch size = 16
|
||
|
||
0%| | 0/8 [00:00<?, ?it/s]
|
||
25%|██▌ | 2/8 [00:00<00:00, 6.08it/s]
|
||
38%|███▊ | 3/8 [00:00<00:01, 4.19it/s]
|
||
50%|█████ | 4/8 [00:01<00:01, 3.61it/s]
|
||
62%|██████▎ | 5/8 [00:01<00:00, 3.37it/s]
|
||
75%|███████▌ | 6/8 [00:01<00:00, 3.22it/s]
|
||
88%|████████▊ | 7/8 [00:02<00:00, 3.13it/s]
|
||
100%|██████████| 8/8 [00:02<00:00, 3.11it/s]
|
||
100%|██████████| 8/8 [00:02<00:00, 3.36it/s]
|
||
***** eval metrics *****
|
||
epoch = 1.0
|
||
eval_loss = 1.4622
|
||
eval_runtime = 0:00:02.69
|
||
eval_samples = 2339
|
||
eval_samples_per_second = 344.926
|
||
eval_steps_per_second = 2.964
|
||
2026-04-14 19:40:23 - INFO - __main__ - *** Training complete ***
|
||
wandb: - 0.014 MB of 0.014 MB uploaded
|
||
wandb: \ 0.014 MB of 0.014 MB uploaded
|
||
wandb: | 0.014 MB of 0.014 MB uploaded
|
||
wandb: / 0.014 MB of 0.014 MB uploaded
|
||
wandb: - 0.043 MB of 0.063 MB uploaded
|
||
wandb: \ 0.065 MB of 0.065 MB uploaded
|
||
wandb: | 0.065 MB of 0.065 MB uploaded
|
||
wandb:
|
||
wandb: Run history:
|
||
wandb: eval/loss █▁
|
||
wandb: eval/runtime █▁
|
||
wandb: eval/samples_per_second ▁█
|
||
wandb: eval/steps_per_second ▁█
|
||
wandb: train/epoch ▁▁▂▃▃▄▄▅▅▆▆▆▇▇███
|
||
wandb: train/global_step ▁▁▂▃▃▄▄▅▅▆▆▆▇▇███
|
||
wandb: train/grad_norm █▃▄▄▃▁▂▁▁▁▁▁▁▁
|
||
wandb: train/learning_rate ▁▆██▇▇▆▅▄▃▂▂▁▁
|
||
wandb: train/loss █▇▄▃▂▂▂▁▁▁▁▁▁▁
|
||
wandb:
|
||
wandb: Run summary:
|
||
wandb: eval/loss 1.46225
|
||
wandb: eval/runtime 2.6991
|
||
wandb: eval/samples_per_second 344.926
|
||
wandb: eval/steps_per_second 2.964
|
||
wandb: total_flos 5.022030486516531e+16
|
||
wandb: train/epoch 1.0
|
||
wandb: train/global_step 135
|
||
wandb: train/grad_norm 0.8951
|
||
wandb: train/learning_rate 0.0
|
||
wandb: train/loss 1.4526
|
||
wandb: train_loss 1.71951
|
||
wandb: train_runtime 501.5523
|
||
wandb: train_samples_per_second 34.284
|
||
wandb: train_steps_per_second 0.269
|
||
wandb:
|
||
wandb: 🚀 View run qwen3-8b-base-sft-hh-helpful-8xh200-20260414-192602-232981 at: https://wandb.ai/feng-cheng-northeastern-university/huggingface/runs/mlwlhzba
|
||
wandb: ⭐️ View project at: https://wandb.ai/feng-cheng-northeastern-university/huggingface
|
||
wandb: Synced 6 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)
|
||
wandb: Find logs at: /scratch/qu.yang1/wandb/wandb/run-20260414_193038-mlwlhzba/logs
|
||
wandb: WARNING The new W&B backend becomes opt-out in version 0.18.0; try it out with `wandb.require("core")`! See https://wandb.me/wandb-core for more information.
|