初始化项目,由ModelHub XC社区提供模型
Model: simone-papicchio/Think2SQL-7B Source: Original Platform
This commit is contained in:
238
all_configurations.yaml
Normal file
238
all_configurations.yaml
Normal file
@@ -0,0 +1,238 @@
|
||||
# Options describing which base model is loaded and how (attention backend,
# dtype, quantization flags, LoRA hyper-parameters).
model_args:
  attn_implementation: flash_attention_2
  bnb_4bit_quant_type: nf4
  load_in_4bit: false
  load_in_8bit: false
  # NOTE(review): use_peft is false below, so the lora_* values are presumably
  # inert for this run - confirm against the code that consumes this section.
  lora_alpha: 32
  lora_dropout: 0.05
  lora_modules_to_save: null
  lora_r: 16
  lora_target_modules: null
  lora_task_type: CAUSAL_LM
  model_name_or_path: Qwen/Qwen2.5-Coder-7B-Instruct
  model_revision: main
  torch_dtype: bfloat16
  trust_remote_code: false
  use_bnb_nested_quant: false
  use_dora: false
  use_peft: false
  use_rslora: false

# Script-level options: dataset identity and split names, cosine_* reward
# bounds, and the list of reward functions.
script_args:
  cosine_max_len: 1000
  cosine_max_value_correct: 1.0
  cosine_max_value_wrong: -0.5
  cosine_min_value_correct: 0.5
  cosine_min_value_wrong: 0.0
  dataset_config: null
  dataset_name: simone-papicchio/bird
  dataset_test_split: test
  dataset_train_split: train
  gradient_checkpointing_use_reentrant: false
  ignore_bias_buffers: false
  # Three reward functions are listed; training_args.reward_weights also has
  # three entries. NOTE(review): presumably paired by position - confirm.
  reward_funcs:
    - qatch_metrics
    - format
    - tag_count

# Trainer / GRPO / vLLM settings, one key per argument, in alphabetical order.
#
# NOTE(review): several values carry Python-specific tags
# (!!python/object/apply:transformers...). A safe YAML loader rejects these
# tags; loading them requires a loader that can import and call the named
# classes, so only load this file from a trusted source.
training_args:
  _n_gpu: 1
  accelerator_config:
    dispatch_batches: null
    even_batches: true
    gradient_accumulation_kwargs: null
    non_blocking: false
    split_batches: false
    use_configured_state: false
    use_seedable_sampler: true
  adafactor: false
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  add_system_prompt: true
  add_validation: false
  auto_find_batch_size: false
  average_tokens_across_devices: false
  base_db_path: data/bird_train/train_databases
  batch_eval_metrics: false
  benchmarks: []
  beta: 0.04
  bf16: true
  bf16_full_eval: false
  cache_implementation: null
  # NOTE(review): absolute path; presumably specific to the machine the run
  # was launched on - confirm before reusing this configuration elsewhere.
  cached_file_path: /workspaces/deep_thinking/cache_target_sql2execution_BIRD_train.pkl
  callbacks: {}
  chat_template: null
  data_seed: null
  dataloader_drop_last: false
  dataloader_num_workers: 0
  dataloader_persistent_workers: false
  dataloader_pin_memory: true
  dataloader_prefetch_factor: null
  dataset_test_split_name: validation
  ddp_backend: null
  ddp_broadcast_buffers: null
  ddp_bucket_cap_mb: null
  ddp_find_unused_parameters: null
  ddp_timeout: 1800
  debug: []
  deepspeed: null
  disable_tqdm: false
  do_eval: false
  do_predict: false
  do_train: false
  ds3_gather_for_generation: true
  epsilon: 0.2
  epsilon_high: null
  eval_accumulation_steps: null
  eval_delay: 0
  eval_do_concat_batches: true
  eval_on_start: false
  eval_steps: null
  # The quotes keep 'no' a string; unquoted it could be read as a boolean.
  eval_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
    - 'no'
  eval_use_gather_object: false
  fp16: false
  fp16_backend: auto
  fp16_full_eval: false
  fp16_opt_level: O1
  fsdp: []
  fsdp_config:
    min_num_params: 0
    xla: false
    xla_fsdp_grad_ckpt: false
    xla_fsdp_v2: false
  fsdp_min_num_params: 0
  fsdp_transformer_layer_cls_to_wrap: null
  full_determinism: false
  gradient_accumulation_steps: 16
  gradient_checkpointing: true
  gradient_checkpointing_kwargs:
    use_reentrant: false
  greater_is_better: false
  group_by_length: false
  half_precision_backend: auto
  hub_always_push: false
  # NOTE(review): this id mentions "1.5B" while model_args.model_name_or_path
  # is a 7B model; push_to_hub is false, so it is presumably unused - confirm.
  hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO
  hub_model_revision: main
  hub_private_repo: null
  hub_strategy: !!python/object/apply:transformers.trainer_utils.HubStrategy
    - every_save
  hub_token: null
  ignore_data_skip: false
  include_for_metrics: []
  include_inputs_for_metrics: false
  include_num_input_tokens_seen: false
  include_tokens_per_second: false
  jit_mode_eval: false
  label_names: null
  label_smoothing_factor: 0.0
  learning_rate: 1.0e-06
  length_column_name: length
  load_best_model_at_end: false
  local_rank: 0
  log_completions: true
  log_level: info
  log_level_replica: warning
  log_on_each_node: true
  logging_dir: ./.tensorboard_logging/f5655cd2/
  logging_first_step: true
  logging_nan_inf_filter: true
  logging_steps: 5
  logging_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
    - steps
  lr_scheduler_kwargs: {}
  lr_scheduler_type: !!python/object/apply:transformers.trainer_utils.SchedulerType
    - constant_with_warmup
  max_completion_length: 4096
  max_grad_norm: 0.2
  max_prompt_length: 2048
  max_steps: -1
  metric_for_best_model: loss
  min_p: null
  # This value is one single-quoted string (folded over two lines), not a YAML
  # mapping: it holds the text of a Python dict, including the bare expression
  # torch.bfloat16. The line break inside the quotes folds to a single space.
  model_init_kwargs: '{''revision'': ''main'', ''trust_remote_code'': False, ''attn_implementation'':
    ''flash_attention_2'', ''torch_dtype'': torch.bfloat16, ''use_cache'': False}'
  mp_parameters: ''
  neftune_noise_alpha: null
  no_cuda: false
  num_completions_to_print: 1
  num_generations: 16
  num_iterations: 1
  num_train_epochs: 1.0
  optim: !!python/object/apply:transformers.training_args.OptimizerNames
    - adamw_8bit
  optim_args: null
  optim_target_modules: null
  output_dir: base_models/grpo/Qwen/Qwen2.5-Coder-7B-Instruct/bs_256_ml_4096_gen_16_f5655cd2_RL
  overwrite_hub_revision: false
  overwrite_output_dir: false
  past_index: -1
  per_device_eval_batch_size: 8
  per_device_train_batch_size: 8
  per_gpu_eval_batch_size: null
  per_gpu_train_batch_size: null
  prediction_loss_only: false
  prompt_name: text2sql_model_grpo
  push_to_hub: false
  push_to_hub_model_id: null
  push_to_hub_organization: null
  push_to_hub_revision: false
  push_to_hub_token: null
  ray_scope: last
  ref_model_mixup_alpha: 0.6
  ref_model_sync_steps: 512
  remove_unused_columns: false
  repetition_penalty: 1.0
  report_to:
    - tensorboard
    - wandb
  restore_callback_states_from_checkpoint: false
  # Quoted, so this is the string 'True', not a boolean.
  # NOTE(review): confirm the consumer expects a string here.
  resume_from_checkpoint: 'True'
  # Three weights that sum to 1.0. NOTE(review): presumably applied in order
  # to the three entries of script_args.reward_funcs - confirm.
  reward_weights:
    - 0.85
    - 0.1
    - 0.05
  run_name: exp-9-7B-QATCH
  save_on_each_node: false
  save_only_model: false
  save_safetensors: true
  save_steps: 0.1
  save_strategy: !!python/object/apply:transformers.trainer_utils.SaveStrategy
    - steps
  save_total_limit: 3
  scale_rewards: true
  seed: 42
  skip_memory_metrics: true
  stratified_by_complexity: false
  sync_ref_model: false
  temperature: 0.7
  tf32: null
  top_k: 50
  top_p: 1.0
  torch_compile: false
  torch_compile_backend: null
  torch_compile_mode: null
  torch_empty_cache_steps: null
  torchdynamo: null
  tp_size: 0
  tpu_metrics_debug: false
  tpu_num_cores: null
  use_cpu: false
  use_ipex: false
  use_legacy_prediction_loop: false
  use_liger_kernel: false
  use_liger_loss: false
  use_mps_device: false
  use_vllm: true
  validation_split: 0.2
  vllm_device: auto
  vllm_dtype: bfloat16
  vllm_enable_prefix_caching: null
  vllm_gpu_memory_utilization: 0.7
  vllm_guided_decoding_regex: null
  vllm_max_model_len: null
  vllm_server_host: 127.0.0.1
  vllm_server_port: 24879
  vllm_server_timeout: 120.0
  wandb_log_unique_prompts: true
  warmup_ratio: 0.1
  warmup_steps: 0
  weight_decay: 0.0
Reference in New Issue
Block a user