---
# Training configuration for BlockRank fine-tuning of Mistral-7B-Instruct-v0.3
# on MS MARCO (hard-ID subset) with an auxiliary contrastive loss.
# Reconstructed from a flattened dump: indentation restored per the
# data / model / training (HF TrainingArguments + TRL SFTConfig) schema;
# all key/value pairs preserved verbatim.

# ---- dataset options ----
data:
  data_path: ../data/msmarco/parsed_hard_ids_10p_train.jsonl
  dataset_seed: 42
  max_block_length: 160
  max_seq_length: 6144
  num_documents: 30
  qrels_path: null
  streaming: false
  train_test_split: 0.99
  val_data_path: null

# ---- model / adapter options ----
model:
  attn_implementation: default_blockrank
  # LoRA disabled (use_lora: false); -1 sentinels mean "unset".
  lora_alpha: -1
  lora_dropout: 0.0
  lora_r: -1
  lora_target_modules: q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
  model_name_or_path: /workspace/nilesh_work/hf_cache/Mistral-7B-Instruct-v0.3
  trust_remote_code: false
  use_4bit: false
  use_blockrank: true
  use_lora: false

# ---- trainer options (HF TrainingArguments / TRL SFTConfig) ----
training:
  accelerator_config:
    dispatch_batches: null
    even_batches: true
    gradient_accumulation_kwargs: null
    non_blocking: false
    split_batches: false
    use_seedable_sampler: true
  activation_offloading: false
  adafactor: false
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  assistant_only_loss: false
  auto_find_batch_size: false
  # Auxiliary retrieval loss (use_aux_loss: true below).
  aux_layer_idx: 20
  aux_loss_weight: 0.1
  aux_temperature: 0.05
  average_tokens_across_devices: true
  batch_eval_metrics: false
  bf16: true
  bf16_full_eval: false
  chat_template_path: null
  completion_only_loss: null
  data_seed: null
  dataloader_drop_last: false
  dataloader_num_workers: 0
  dataloader_persistent_workers: false
  dataloader_pin_memory: true
  dataloader_prefetch_factor: null
  dataset_kwargs:
    skip_prepare_dataset: true
  dataset_num_proc: null
  dataset_text_field: text
  ddp_backend: null
  ddp_broadcast_buffers: null
  ddp_bucket_cap_mb: null
  ddp_find_unused_parameters: null
  ddp_timeout: 1800
  debug: []
  deepspeed: null
  disable_tqdm: false
  do_eval: true
  do_predict: false
  do_train: false
  eos_token: <EOS_TOKEN>
  eval_accumulation_steps: null
  eval_delay: 0
  eval_do_concat_batches: true
  eval_on_start: false
  eval_packing: null
  eval_steps: 500
  # Quoted so YAML keeps the string "no" rather than boolean false.
  eval_strategy: 'no'
  eval_use_gather_object: false
  # NOTE(review): deprecated alias of eval_strategy, and its value ("steps")
  # disagrees with eval_strategy ('no') — confirm which one the trainer honors.
  evaluation_strategy: steps
  fp16: false
  fp16_backend: auto
  fp16_full_eval: false
  fp16_opt_level: O1
  fsdp: []
  fsdp_config:
    min_num_params: 0
    xla: false
    xla_fsdp_grad_ckpt: false
    xla_fsdp_v2: false
  fsdp_min_num_params: 0
  fsdp_transformer_layer_cls_to_wrap: null
  full_determinism: false
  gradient_accumulation_steps: 4
  gradient_checkpointing: true
  gradient_checkpointing_kwargs: null
  greater_is_better: false
  group_by_length: false
  half_precision_backend: auto
  hub_always_push: false
  hub_model_id: null
  hub_private_repo: null
  hub_revision: null
  hub_strategy: every_save
  hub_token: <HUB_TOKEN>
  ignore_data_skip: false
  include_for_metrics: []
  include_inputs_for_metrics: false
  include_num_input_tokens_seen: 'no'
  include_tokens_per_second: false
  jit_mode_eval: false
  label_names: null
  label_smoothing_factor: 0.0
  learning_rate: 3.0e-06
  length_column_name: length
  liger_kernel_config: null
  load_best_model_at_end: false
  local_rank: 0
  log_level: passive
  log_level_replica: warning
  log_on_each_node: true
  logging_dir: ../outputs/blockrank-with-aux-loss-mistral-7b-icr-medium_hard_ids-full/runs/Nov03_04-06-43_06353250b0cb
  logging_first_step: true
  logging_nan_inf_filter: true
  logging_steps: 25
  logging_strategy: steps
  loss_type: nll
  lr_scheduler_kwargs: {}
  lr_scheduler_type: cosine
  max_grad_norm: 1.0
  max_length: 1024
  max_steps: -1
  metric_for_best_model: eval_loss
  model_init_kwargs: null
  mp_parameters: ''
  neftune_noise_alpha: null
  no_cuda: false
  num_train_epochs: 1
  optim: adamw_8bit
  optim_args: null
  optim_target_modules: null
  output_dir: ../outputs/blockrank-with-aux-loss-mistral-7b-icr-medium_hard_ids-full
  overwrite_output_dir: false
  packing: false
  packing_strategy: bfd
  pad_to_multiple_of: null
  pad_token: <PAD_TOKEN>
  padding_free: false
  parallelism_config: null
  past_index: -1
  per_device_eval_batch_size: 1
  per_device_train_batch_size: 1
  per_gpu_eval_batch_size: null
  per_gpu_train_batch_size: null
  prediction_loss_only: false
  project: huggingface
  push_to_hub: false
  push_to_hub_model_id: null
  push_to_hub_organization: null
  push_to_hub_token: <PUSH_TO_HUB_TOKEN>
  ray_scope: last
  remove_unused_columns: false
  report_to:
    - wandb
  restore_callback_states_from_checkpoint: false
  resume_from_checkpoint: null
  run_name: blockrank-with-aux-loss-mistral-7b-icr-medium_hard_ids-full
  save_on_each_node: false
  save_only_model: false
  save_safetensors: true
  save_steps: 500
  save_strategy: 'no'
  save_total_limit: 1
  seed: 42
  skip_memory_metrics: true
  tf32: null
  torch_compile: false
  torch_compile_backend: null
  torch_compile_mode: null
  torch_empty_cache_steps: null
  torchdynamo: null
  tpu_metrics_debug: false
  tpu_num_cores: null
  trackio_space_id: trackio
  use_aux_loss: true
  use_cpu: false
  use_legacy_prediction_loop: false
  use_liger_kernel: false
  use_mps_device: false
  warmup_ratio: 0.01
  warmup_steps: 0
  weight_decay: 0