初始化项目,由ModelHub XC社区提供模型
Model: soynade-research/oolel-lit-gemma Source: Original Platform
This commit is contained in:
36
.gitattributes
vendored
Normal file
36
.gitattributes
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||
*.model filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||
96
README.md
Normal file
96
README.md
Normal file
@@ -0,0 +1,96 @@
|
||||
---
|
||||
license: agpl-3.0
|
||||
datasets:
|
||||
- soynade-research/FineWeb2-HQ-50k-Wolof
|
||||
language:
|
||||
- wo
|
||||
- en
|
||||
- fr
|
||||
base_model:
|
||||
- google/gemma-3-270m-it
|
||||
pipeline_tag: text-generation
|
||||
---
|
||||
|
||||
# Oolel-lit-gemma
|
||||
|
||||
Oolel-lit-gemma is a fine-tuned version of [Gemma-3-270m-it](https://huggingface.co/google/gemma-3-270m-it)
|
||||
for the Wolof language. It is part of our Oolel family of compact, on-device Wolof language models
|
||||
developed by Soynade Research.
|
||||
|
||||
The model was trained using supervised fine-tuning (SFT) on synthetic data distilled from our
|
||||
larger **Oolel-7B** models via [Oolel-translator](https://github.com/soynade-research/oolel-translator).
|
||||
|
||||
|
||||
## Usage
|
||||
|
||||
### Quick start with pipeline
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
generator = pipeline(
|
||||
"text-generation",
|
||||
model="soynade-research/oolel-lit-gemma",
|
||||
device="cuda",
|
||||
)
|
||||
|
||||
messages = [{"role": "user", "content": "Translate to Wolof: The president is 45 years old."}]
|
||||
|
||||
output = generator(messages, max_new_tokens=256, return_full_text=False)
|
||||
print(output[0]["generated_text"])
|
||||
```
|
||||
### With AutoModel for more control
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer, Gemma3ForCausalLM
|
||||
import torch
|
||||
|
||||
model_id = "soynade-research/oolel-lit-gemma"
|
||||
|
||||
|
||||
model = Gemma3ForCausalLM.from_pretrained(
|
||||
model_id
|
||||
).eval()
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "system",
|
||||
"content": [{"type": "text", "text": "You're a Wolof AI assistant. Please always provide detailed and useful answers to the user queries."},]
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [{"type": "text", "text": "Translate to Wolof: The president is 45 years old."},]
|
||||
},
|
||||
],
|
||||
]
|
||||
|
||||
inputs = tokenizer.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
).to(model.device)
|
||||
|
||||
|
||||
with torch.inference_mode():
|
||||
outputs = model.generate(**inputs, max_new_tokens=256,
|
||||
do_sample=True,
|
||||
temperature=0.7,
|
||||
top_p=0.9,)
|
||||
|
||||
outputs = tokenizer.batch_decode(outputs)
|
||||
|
||||
|
||||
```
|
||||
## Training
|
||||
The training code and configuration are available at
|
||||
[soynade-research/oolel-trainer](https://github.com/soynade-research/oolel-trainer).
|
||||
|
||||
## Limitations
|
||||
- Primarily optimized for Wolof; performance on other languages may vary
|
||||
- As a 270M-parameter model, it may struggle with complex tasks
|
||||
- Outputs should be verified by a native Wolof speaker for critical applications
|
||||
3
added_tokens.json
Normal file
3
added_tokens.json
Normal file
@@ -0,0 +1,3 @@
|
||||
{
|
||||
"<image_soft_token>": 262144
|
||||
}
|
||||
362
args.json
Normal file
362
args.json
Normal file
@@ -0,0 +1,362 @@
|
||||
{
|
||||
"output_dir": "/workspace/output/v0-20260216-180455",
|
||||
"overwrite_output_dir": false,
|
||||
"do_train": false,
|
||||
"do_eval": false,
|
||||
"do_predict": false,
|
||||
"eval_strategy": "no",
|
||||
"prediction_loss_only": false,
|
||||
"per_device_train_batch_size": 8,
|
||||
"per_device_eval_batch_size": 1,
|
||||
"per_gpu_train_batch_size": null,
|
||||
"per_gpu_eval_batch_size": null,
|
||||
"gradient_accumulation_steps": 16,
|
||||
"eval_accumulation_steps": null,
|
||||
"eval_delay": 0,
|
||||
"torch_empty_cache_steps": null,
|
||||
"learning_rate": 0.0001,
|
||||
"weight_decay": 0.1,
|
||||
"adam_beta1": 0.9,
|
||||
"adam_beta2": 0.95,
|
||||
"adam_epsilon": 1e-08,
|
||||
"max_grad_norm": 1.0,
|
||||
"num_train_epochs": 2.0,
|
||||
"max_steps": -1,
|
||||
"lr_scheduler_type": "cosine",
|
||||
"lr_scheduler_kwargs": null,
|
||||
"warmup_ratio": 0.05,
|
||||
"warmup_steps": 0,
|
||||
"log_level": "passive",
|
||||
"log_level_replica": "warning",
|
||||
"log_on_each_node": true,
|
||||
"logging_dir": "/workspace/output/v0-20260216-180455/runs",
|
||||
"logging_strategy": "steps",
|
||||
"logging_first_step": true,
|
||||
"logging_steps": 100,
|
||||
"logging_nan_inf_filter": true,
|
||||
"save_strategy": "steps",
|
||||
"save_steps": 1000.0,
|
||||
"save_total_limit": 3,
|
||||
"save_safetensors": true,
|
||||
"save_on_each_node": false,
|
||||
"save_only_model": false,
|
||||
"restore_callback_states_from_checkpoint": false,
|
||||
"no_cuda": false,
|
||||
"use_cpu": false,
|
||||
"use_mps_device": false,
|
||||
"seed": 42,
|
||||
"data_seed": 42,
|
||||
"jit_mode_eval": false,
|
||||
"bf16": true,
|
||||
"fp16": false,
|
||||
"fp16_opt_level": "O1",
|
||||
"half_precision_backend": "auto",
|
||||
"bf16_full_eval": false,
|
||||
"fp16_full_eval": false,
|
||||
"tf32": null,
|
||||
"local_rank": -1,
|
||||
"ddp_backend": null,
|
||||
"tpu_num_cores": null,
|
||||
"tpu_metrics_debug": false,
|
||||
"debug": null,
|
||||
"dataloader_drop_last": false,
|
||||
"eval_steps": null,
|
||||
"dataloader_num_workers": 16,
|
||||
"dataloader_prefetch_factor": null,
|
||||
"past_index": -1,
|
||||
"run_name": "/workspace/output/v0-20260216-180455",
|
||||
"disable_tqdm": null,
|
||||
"remove_unused_columns": true,
|
||||
"label_names": null,
|
||||
"load_best_model_at_end": false,
|
||||
"metric_for_best_model": "loss",
|
||||
"greater_is_better": false,
|
||||
"ignore_data_skip": false,
|
||||
"fsdp": [],
|
||||
"fsdp_min_num_params": 0,
|
||||
"fsdp_config": null,
|
||||
"fsdp_transformer_layer_cls_to_wrap": null,
|
||||
"accelerator_config": {
|
||||
"dispatch_batches": false
|
||||
},
|
||||
"parallelism_config": null,
|
||||
"deepspeed": null,
|
||||
"label_smoothing_factor": 0.0,
|
||||
"optim": "adamw_torch_fused",
|
||||
"optim_args": null,
|
||||
"adafactor": false,
|
||||
"group_by_length": false,
|
||||
"length_column_name": "length",
|
||||
"report_to": [
|
||||
"tensorboard"
|
||||
],
|
||||
"project": "huggingface",
|
||||
"trackio_space_id": "trackio",
|
||||
"ddp_find_unused_parameters": null,
|
||||
"ddp_bucket_cap_mb": null,
|
||||
"ddp_broadcast_buffers": null,
|
||||
"dataloader_pin_memory": true,
|
||||
"dataloader_persistent_workers": false,
|
||||
"skip_memory_metrics": true,
|
||||
"use_legacy_prediction_loop": false,
|
||||
"push_to_hub": true,
|
||||
"resume_from_checkpoint": null,
|
||||
"hub_model_id": null,
|
||||
"hub_strategy": "every_save",
|
||||
"hub_token": null,
|
||||
"hub_private_repo": true,
|
||||
"hub_always_push": false,
|
||||
"hub_revision": null,
|
||||
"gradient_checkpointing": true,
|
||||
"gradient_checkpointing_kwargs": null,
|
||||
"include_inputs_for_metrics": false,
|
||||
"include_for_metrics": [],
|
||||
"eval_do_concat_batches": true,
|
||||
"fp16_backend": "auto",
|
||||
"push_to_hub_model_id": null,
|
||||
"push_to_hub_organization": null,
|
||||
"push_to_hub_token": null,
|
||||
"mp_parameters": "",
|
||||
"auto_find_batch_size": false,
|
||||
"full_determinism": false,
|
||||
"torchdynamo": null,
|
||||
"ray_scope": "last",
|
||||
"ddp_timeout": 18000000,
|
||||
"torch_compile": false,
|
||||
"torch_compile_backend": null,
|
||||
"torch_compile_mode": null,
|
||||
"include_tokens_per_second": false,
|
||||
"include_num_input_tokens_seen": false,
|
||||
"neftune_noise_alpha": null,
|
||||
"optim_target_modules": null,
|
||||
"batch_eval_metrics": false,
|
||||
"eval_on_start": false,
|
||||
"use_liger_kernel": false,
|
||||
"liger_kernel_config": null,
|
||||
"eval_use_gather_object": false,
|
||||
"average_tokens_across_devices": true,
|
||||
"sortish_sampler": false,
|
||||
"predict_with_generate": false,
|
||||
"generation_max_length": null,
|
||||
"generation_num_beams": null,
|
||||
"generation_config": null,
|
||||
"tuner_backend": "peft",
|
||||
"vit_gradient_checkpointing": null,
|
||||
"router_aux_loss_coef": 0.0,
|
||||
"enable_dft_loss": false,
|
||||
"enable_channel_loss": false,
|
||||
"check_model": true,
|
||||
"acc_strategy": "token",
|
||||
"train_dataloader_shuffle": true,
|
||||
"max_epochs": null,
|
||||
"aligner_lr": null,
|
||||
"vit_lr": null,
|
||||
"use_logits_to_keep": null,
|
||||
"ds3_gather_for_generation": true,
|
||||
"resume_only_model": false,
|
||||
"optimizer": null,
|
||||
"loss_type": null,
|
||||
"metric": null,
|
||||
"eval_use_evalscope": false,
|
||||
"eval_dataset": [],
|
||||
"eval_dataset_args": null,
|
||||
"eval_limit": null,
|
||||
"eval_generation_config": null,
|
||||
"extra_eval_args": null,
|
||||
"use_flash_ckpt": false,
|
||||
"use_ray": false,
|
||||
"ray_exp_name": null,
|
||||
"device_groups": null,
|
||||
"model": "google/gemma-3-270m-it",
|
||||
"model_type": "gemma3_text",
|
||||
"model_revision": null,
|
||||
"task_type": "causal_lm",
|
||||
"torch_dtype": "bfloat16",
|
||||
"attn_impl": "flash_attn",
|
||||
"new_special_tokens": [],
|
||||
"num_labels": null,
|
||||
"problem_type": null,
|
||||
"rope_scaling": null,
|
||||
"device_map": null,
|
||||
"max_memory": {},
|
||||
"max_model_len": null,
|
||||
"local_repo_path": null,
|
||||
"init_strategy": null,
|
||||
"template": "gemma3_text",
|
||||
"system": null,
|
||||
"max_length": 4096,
|
||||
"truncation_strategy": "delete",
|
||||
"max_pixels": null,
|
||||
"agent_template": null,
|
||||
"norm_bbox": null,
|
||||
"use_chat_template": true,
|
||||
"padding_side": "right",
|
||||
"padding_free": false,
|
||||
"loss_scale": "default",
|
||||
"sequence_parallel_size": 1,
|
||||
"template_backend": "swift",
|
||||
"response_prefix": null,
|
||||
"enable_thinking": null,
|
||||
"add_non_thinking_prefix": true,
|
||||
"dataset": [
|
||||
"soynade-research/wo-dioula"
|
||||
],
|
||||
"val_dataset": [],
|
||||
"cached_dataset": [],
|
||||
"cached_val_dataset": [],
|
||||
"split_dataset_ratio": 0.0,
|
||||
"dataset_num_proc": 1,
|
||||
"load_from_cache_file": false,
|
||||
"dataset_shuffle": true,
|
||||
"val_dataset_shuffle": false,
|
||||
"streaming": false,
|
||||
"interleave_prob": null,
|
||||
"stopping_strategy": "first_exhausted",
|
||||
"shuffle_buffer_size": 1000,
|
||||
"download_mode": "reuse_dataset_if_exists",
|
||||
"columns": {},
|
||||
"strict": false,
|
||||
"model_name": [
|
||||
"oolel-lit-gemma"
|
||||
],
|
||||
"model_author": [
|
||||
"soynade-research"
|
||||
],
|
||||
"custom_dataset_info": [],
|
||||
"quant_method": null,
|
||||
"quant_bits": null,
|
||||
"hqq_axis": null,
|
||||
"bnb_4bit_compute_dtype": "bfloat16",
|
||||
"bnb_4bit_quant_type": "nf4",
|
||||
"bnb_4bit_use_double_quant": true,
|
||||
"bnb_4bit_quant_storage": null,
|
||||
"max_new_tokens": 64,
|
||||
"temperature": 0.0,
|
||||
"top_k": null,
|
||||
"top_p": null,
|
||||
"repetition_penalty": null,
|
||||
"num_beams": 1,
|
||||
"stream": false,
|
||||
"stop_words": [],
|
||||
"logprobs": false,
|
||||
"top_logprobs": null,
|
||||
"structured_outputs_regex": null,
|
||||
"ckpt_dir": null,
|
||||
"lora_modules": [],
|
||||
"train_type": "full",
|
||||
"adapters": [],
|
||||
"external_plugins": [],
|
||||
"model_kwargs": {},
|
||||
"load_args": false,
|
||||
"load_data_args": false,
|
||||
"packing": false,
|
||||
"packing_length": null,
|
||||
"packing_num_proc": 1,
|
||||
"lazy_tokenize": false,
|
||||
"custom_register_path": [],
|
||||
"use_hf": true,
|
||||
"ignore_args_error": false,
|
||||
"use_swift_lora": false,
|
||||
"freeze_parameters": [],
|
||||
"freeze_parameters_regex": null,
|
||||
"freeze_parameters_ratio": 0.0,
|
||||
"trainable_parameters": [],
|
||||
"trainable_parameters_regex": null,
|
||||
"freeze_llm": false,
|
||||
"freeze_vit": true,
|
||||
"freeze_aligner": true,
|
||||
"target_modules": [
|
||||
"all-linear"
|
||||
],
|
||||
"target_regex": null,
|
||||
"target_parameters": null,
|
||||
"modules_to_save": [],
|
||||
"lora_rank": 8,
|
||||
"lora_alpha": 32,
|
||||
"lora_dropout": 0.05,
|
||||
"lora_bias": "none",
|
||||
"lora_dtype": null,
|
||||
"lorap_lr_ratio": null,
|
||||
"use_rslora": false,
|
||||
"use_dora": false,
|
||||
"lora_ga_batch_size": 2,
|
||||
"lora_ga_iters": 2,
|
||||
"lora_ga_max_length": 1024,
|
||||
"lora_ga_direction": "ArB2r",
|
||||
"lora_ga_scale": "stable",
|
||||
"lora_ga_stable_gamma": 16,
|
||||
"init_weights": true,
|
||||
"fourier_n_frequency": 2000,
|
||||
"fourier_scaling": 300.0,
|
||||
"boft_block_size": 4,
|
||||
"boft_block_num": 0,
|
||||
"boft_n_butterfly_factor": 1,
|
||||
"boft_dropout": 0.0,
|
||||
"vera_rank": 256,
|
||||
"vera_projection_prng_key": 0,
|
||||
"vera_dropout": 0.0,
|
||||
"vera_d_initial": 0.1,
|
||||
"adapter_act": "gelu",
|
||||
"adapter_length": 128,
|
||||
"use_galore": false,
|
||||
"galore_target_modules": null,
|
||||
"galore_rank": 128,
|
||||
"galore_update_proj_gap": 50,
|
||||
"galore_scale": 1.0,
|
||||
"galore_proj_type": "std",
|
||||
"galore_optim_per_parameter": false,
|
||||
"galore_with_embedding": false,
|
||||
"galore_quantization": false,
|
||||
"galore_proj_quant": false,
|
||||
"galore_proj_bits": 4,
|
||||
"galore_proj_group_size": 256,
|
||||
"galore_cos_threshold": 0.4,
|
||||
"galore_gamma_proj": 2,
|
||||
"galore_queue_size": 5,
|
||||
"adalora_target_r": 8,
|
||||
"adalora_init_r": 12,
|
||||
"adalora_tinit": 0,
|
||||
"adalora_tfinal": 0,
|
||||
"adalora_deltaT": 1,
|
||||
"adalora_beta1": 0.85,
|
||||
"adalora_beta2": 0.85,
|
||||
"adalora_orth_reg_weight": 0.5,
|
||||
"llamapro_num_new_blocks": 4,
|
||||
"llamapro_num_groups": null,
|
||||
"lisa_activated_layers": 0,
|
||||
"lisa_step_interval": 20,
|
||||
"reft_layer_key": null,
|
||||
"reft_layers": null,
|
||||
"reft_rank": 4,
|
||||
"reft_intervention_type": "LoreftIntervention",
|
||||
"reft_args": null,
|
||||
"swanlab_token": null,
|
||||
"swanlab_project": "ms-swift",
|
||||
"swanlab_workspace": null,
|
||||
"swanlab_exp_name": null,
|
||||
"swanlab_notification_method": null,
|
||||
"swanlab_webhook_url": null,
|
||||
"swanlab_secret": null,
|
||||
"swanlab_sender_email": null,
|
||||
"swanlab_receiver_email": null,
|
||||
"swanlab_smtp_server": null,
|
||||
"swanlab_smtp_port": null,
|
||||
"swanlab_email_language": "zh",
|
||||
"swanlab_mode": "cloud",
|
||||
"add_version": true,
|
||||
"create_checkpoint_symlink": false,
|
||||
"zero_hpz_partition_size": null,
|
||||
"deepspeed_autotp_size": null,
|
||||
"early_stop_interval": null,
|
||||
"rank": -1,
|
||||
"global_world_size": 1,
|
||||
"local_world_size": 1,
|
||||
"model_suffix": "gemma-3-270m-it",
|
||||
"model_info": "ModelInfo(model_type='gemma3_text', model_dir='/workspace/hf_artifacts/hub/models--google--gemma-3-270m-it/snapshots/ac82b4e820549b854eebf28ce6dedaf9fdfa17b3', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, is_multimodal=False, config=None, task_type='causal_lm', num_labels=None)",
|
||||
"model_meta": "ModelMeta(model_type='gemma3_text', model_groups=[ModelGroup(models=[Model(ms_model_id='LLM-Research/gemma-3-1b-pt', hf_model_id='google/gemma-3-1b-pt', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='LLM-Research/gemma-3-1b-it', hf_model_id='google/gemma-3-1b-it', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='google/gemma-3-270m', hf_model_id='google/gemma-3-270m', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='google/gemma-3-270m-it', hf_model_id='google/gemma-3-270m-it', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='google/medgemma-27b-text-it', hf_model_id='google/medgemma-27b-text-it', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='gemma3_text', get_function=<function get_model_tokenizer_gemma3_text at 0x70064a6a37e0>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Gemma3ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49'], tags=[])",
|
||||
"model_dir": "/workspace/hf_artifacts/hub/models--google--gemma-3-270m-it/snapshots/ac82b4e820549b854eebf28ce6dedaf9fdfa17b3",
|
||||
"_val_dataset_exists": [],
|
||||
"hub": "<class 'swift.hub.hub.HFHub'>",
|
||||
"evaluation_strategy": "no",
|
||||
"training_args": "Seq2SeqTrainingArguments(output_dir='/workspace/output/v0-20260216-180455', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, eval_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=16, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=2.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/workspace/output/v0-20260216-180455/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=100, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=1000, save_total_limit=3, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=None, dataloader_num_workers=16, dataloader_prefetch_factor=2, past_index=-1, run_name='/workspace/output/v0-20260216-180455', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 
fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], project='huggingface', trackio_space_id='trackio', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=True, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=True, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, 
acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, chord_sft_dataset=[], chord_sft_per_device_train_batch_size=None, chord_enable_phi_function=False, chord_mu_warmup_steps=None, chord_mu_decay_steps=None, chord_mu_peak=None, chord_mu_valley=None, train_type='full', local_repo_path=None, galore_config=None, task_type='causal_lm', problem_type=None)"
|
||||
}
|
||||
47
chat_template.jinja
Normal file
47
chat_template.jinja
Normal file
@@ -0,0 +1,47 @@
|
||||
{{ bos_token }}
|
||||
{%- if messages[0]['role'] == 'system' -%}
|
||||
{%- if messages[0]['content'] is string -%}
|
||||
{%- set first_user_prefix = messages[0]['content'] + '
|
||||
|
||||
' -%}
|
||||
{%- else -%}
|
||||
{%- set first_user_prefix = messages[0]['content'][0]['text'] + '
|
||||
|
||||
' -%}
|
||||
{%- endif -%}
|
||||
{%- set loop_messages = messages[1:] -%}
|
||||
{%- else -%}
|
||||
{%- set first_user_prefix = "" -%}
|
||||
{%- set loop_messages = messages -%}
|
||||
{%- endif -%}
|
||||
{%- for message in loop_messages -%}
|
||||
{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
|
||||
{{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
|
||||
{%- endif -%}
|
||||
{%- if (message['role'] == 'assistant') -%}
|
||||
{%- set role = "model" -%}
|
||||
{%- else -%}
|
||||
{%- set role = message['role'] -%}
|
||||
{%- endif -%}
|
||||
{{ '<start_of_turn>' + role + '
|
||||
' + (first_user_prefix if loop.first else "") }}
|
||||
{%- if message['content'] is string -%}
|
||||
{{ message['content'] | trim }}
|
||||
{%- elif message['content'] is iterable -%}
|
||||
{%- for item in message['content'] -%}
|
||||
{%- if item['type'] == 'image' -%}
|
||||
{{ '<start_of_image>' }}
|
||||
{%- elif item['type'] == 'text' -%}
|
||||
{{ item['text'] | trim }}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- else -%}
|
||||
{{ raise_exception("Invalid content type") }}
|
||||
{%- endif -%}
|
||||
{{ '<end_of_turn>
|
||||
' }}
|
||||
{%- endfor -%}
|
||||
{%- if add_generation_prompt -%}
|
||||
{{'<start_of_turn>model
|
||||
'}}
|
||||
{%- endif -%}
|
||||
54
config.json
Normal file
54
config.json
Normal file
@@ -0,0 +1,54 @@
|
||||
{
|
||||
"_sliding_window_pattern": 6,
|
||||
"architectures": [
|
||||
"Gemma3ForCausalLM"
|
||||
],
|
||||
"attention_bias": false,
|
||||
"attention_dropout": 0.0,
|
||||
"attn_logit_softcapping": null,
|
||||
"bos_token_id": 2,
|
||||
"dtype": "bfloat16",
|
||||
"eos_token_id": 1,
|
||||
"final_logit_softcapping": null,
|
||||
"head_dim": 256,
|
||||
"hidden_activation": "gelu_pytorch_tanh",
|
||||
"hidden_size": 640,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 2048,
|
||||
"layer_types": [
|
||||
"sliding_attention",
|
||||
"sliding_attention",
|
||||
"sliding_attention",
|
||||
"sliding_attention",
|
||||
"sliding_attention",
|
||||
"full_attention",
|
||||
"sliding_attention",
|
||||
"sliding_attention",
|
||||
"sliding_attention",
|
||||
"sliding_attention",
|
||||
"sliding_attention",
|
||||
"full_attention",
|
||||
"sliding_attention",
|
||||
"sliding_attention",
|
||||
"sliding_attention",
|
||||
"sliding_attention",
|
||||
"sliding_attention",
|
||||
"full_attention"
|
||||
],
|
||||
"max_position_embeddings": 32768,
|
||||
"model_type": "gemma3_text",
|
||||
"num_attention_heads": 4,
|
||||
"num_hidden_layers": 18,
|
||||
"num_key_value_heads": 1,
|
||||
"pad_token_id": 0,
|
||||
"query_pre_attn_scalar": 256,
|
||||
"rms_norm_eps": 1e-06,
|
||||
"rope_local_base_freq": 10000.0,
|
||||
"rope_scaling": null,
|
||||
"rope_theta": 1000000.0,
|
||||
"sliding_window": 512,
|
||||
"transformers_version": "4.57.6",
|
||||
"use_bidirectional_attention": false,
|
||||
"use_cache": false,
|
||||
"vocab_size": 262144
|
||||
}
|
||||
11
generation_config.json
Normal file
11
generation_config.json
Normal file
@@ -0,0 +1,11 @@
|
||||
{
|
||||
"cache_implementation": "hybrid",
|
||||
"do_sample": true,
|
||||
"eos_token_id": [
|
||||
1,
|
||||
106
|
||||
],
|
||||
"top_k": 64,
|
||||
"top_p": 0.95,
|
||||
"transformers_version": "4.57.6"
|
||||
}
|
||||
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:1fc70f5244b2b7169ddd3291e41e6dc26dbee19083d1e94543a6db45d34a2c1a
|
||||
size 536223056
|
||||
3
optimizer.pt
Normal file
3
optimizer.pt
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:1f61eb9eddbe080a81e2770902928fc08589bdfda8a865664694de46cc3a0b81
|
||||
size 1072594443
|
||||
3
rng_state.pth
Normal file
3
rng_state.pth
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
|
||||
size 14645
|
||||
3
scheduler.pt
Normal file
3
scheduler.pt
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:94b9d21250e0c4c9206344307e30b2bb0ff729fb03b4200d5a8004e9714582e5
|
||||
size 1465
|
||||
33
special_tokens_map.json
Normal file
33
special_tokens_map.json
Normal file
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"boi_token": "<start_of_image>",
|
||||
"bos_token": {
|
||||
"content": "<bos>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"eoi_token": "<end_of_image>",
|
||||
"eos_token": {
|
||||
"content": "<eos>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"image_token": "<image_soft_token>",
|
||||
"pad_token": {
|
||||
"content": "<pad>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"unk_token": {
|
||||
"content": "<unk>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
}
|
||||
}
|
||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
|
||||
size 33384568
|
||||
3
tokenizer.model
Normal file
3
tokenizer.model
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
|
||||
size 4689074
|
||||
51345
tokenizer_config.json
Normal file
51345
tokenizer_config.json
Normal file
File diff suppressed because it is too large
Load Diff
922
trainer_state.json
Normal file
922
trainer_state.json
Normal file
@@ -0,0 +1,922 @@
|
||||
{
|
||||
"best_global_step": null,
|
||||
"best_metric": null,
|
||||
"best_model_checkpoint": null,
|
||||
"epoch": 2.0,
|
||||
"eval_steps": 500,
|
||||
"global_step": 11066,
|
||||
"is_hyper_param_search": false,
|
||||
"is_local_process_zero": true,
|
||||
"is_world_process_zero": true,
|
||||
"log_history": [
|
||||
{
|
||||
"epoch": 0.00018074807107917894,
|
||||
"grad_norm": 58.25,
|
||||
"learning_rate": 1.805054151624549e-07,
|
||||
"loss": 4.652698516845703,
|
||||
"step": 1,
|
||||
"token_acc": 0.43327626145634
|
||||
},
|
||||
{
|
||||
"epoch": 0.018074807107917895,
|
||||
"grad_norm": 6.53125,
|
||||
"learning_rate": 1.805054151624549e-05,
|
||||
"loss": 3.920129641137942,
|
||||
"step": 100,
|
||||
"token_acc": 0.45332661759508064
|
||||
},
|
||||
{
|
||||
"epoch": 0.03614961421583579,
|
||||
"grad_norm": 6.0,
|
||||
"learning_rate": 3.610108303249098e-05,
|
||||
"loss": 2.8419125366210936,
|
||||
"step": 200,
|
||||
"token_acc": 0.5193163006097973
|
||||
},
|
||||
{
|
||||
"epoch": 0.054224421323753685,
|
||||
"grad_norm": 5.625,
|
||||
"learning_rate": 5.415162454873647e-05,
|
||||
"loss": 2.2325677490234375,
|
||||
"step": 300,
|
||||
"token_acc": 0.577260581442613
|
||||
},
|
||||
{
|
||||
"epoch": 0.07229922843167158,
|
||||
"grad_norm": 5.09375,
|
||||
"learning_rate": 7.220216606498195e-05,
|
||||
"loss": 1.9477809143066407,
|
||||
"step": 400,
|
||||
"token_acc": 0.6095200383891847
|
||||
},
|
||||
{
|
||||
"epoch": 0.09037403553958948,
|
||||
"grad_norm": 4.4375,
|
||||
"learning_rate": 9.025270758122743e-05,
|
||||
"loss": 1.7510385131835937,
|
||||
"step": 500,
|
||||
"token_acc": 0.6378192081318703
|
||||
},
|
||||
{
|
||||
"epoch": 0.10844884264750737,
|
||||
"grad_norm": 4.28125,
|
||||
"learning_rate": 9.999527526045029e-05,
|
||||
"loss": 1.6535232543945313,
|
||||
"step": 600,
|
||||
"token_acc": 0.6496733713777781
|
||||
},
|
||||
{
|
||||
"epoch": 0.12652364975542527,
|
||||
"grad_norm": 3.625,
|
||||
"learning_rate": 9.99524110790929e-05,
|
||||
"loss": 1.5711769104003905,
|
||||
"step": 700,
|
||||
"token_acc": 0.6620965629303417
|
||||
},
|
||||
{
|
||||
"epoch": 0.14459845686334316,
|
||||
"grad_norm": 3.4375,
|
||||
"learning_rate": 9.986493474590536e-05,
|
||||
"loss": 1.5024029541015624,
|
||||
"step": 800,
|
||||
"token_acc": 0.6721741041947722
|
||||
},
|
||||
{
|
||||
"epoch": 0.16267326397126106,
|
||||
"grad_norm": 3.0,
|
||||
"learning_rate": 9.973292438539405e-05,
|
||||
"loss": 1.4528140258789062,
|
||||
"step": 900,
|
||||
"token_acc": 0.6810912279574309
|
||||
},
|
||||
{
|
||||
"epoch": 0.18074807107917895,
|
||||
"grad_norm": 3.421875,
|
||||
"learning_rate": 9.955649789509624e-05,
|
||||
"loss": 1.4288172912597656,
|
||||
"step": 1000,
|
||||
"token_acc": 0.6845218263208889
|
||||
},
|
||||
{
|
||||
"epoch": 0.19882287818709685,
|
||||
"grad_norm": 3.4375,
|
||||
"learning_rate": 9.933581284028659e-05,
|
||||
"loss": 1.4166110229492188,
|
||||
"step": 1100,
|
||||
"token_acc": 0.68570654719404
|
||||
},
|
||||
{
|
||||
"epoch": 0.21689768529501474,
|
||||
"grad_norm": 2.90625,
|
||||
"learning_rate": 9.907106631325671e-05,
|
||||
"loss": 1.3844677734375,
|
||||
"step": 1200,
|
||||
"token_acc": 0.6901876362337299
|
||||
},
|
||||
{
|
||||
"epoch": 0.23497249240293264,
|
||||
"grad_norm": 2.375,
|
||||
"learning_rate": 9.876249475729344e-05,
|
||||
"loss": 1.3720639038085938,
|
||||
"step": 1300,
|
||||
"token_acc": 0.6921398802222583
|
||||
},
|
||||
{
|
||||
"epoch": 0.25304729951085053,
|
||||
"grad_norm": 2.6875,
|
||||
"learning_rate": 9.841037375551294e-05,
|
||||
"loss": 1.3502085876464844,
|
||||
"step": 1400,
|
||||
"token_acc": 0.6961120678024922
|
||||
},
|
||||
{
|
||||
"epoch": 0.27112210661876845,
|
||||
"grad_norm": 2.53125,
|
||||
"learning_rate": 9.801501778473935e-05,
|
||||
"loss": 1.3242225646972656,
|
||||
"step": 1500,
|
||||
"token_acc": 0.7008815156964483
|
||||
},
|
||||
{
|
||||
"epoch": 0.2891969137266863,
|
||||
"grad_norm": 2.671875,
|
||||
"learning_rate": 9.757677993464771e-05,
|
||||
"loss": 1.3310586547851562,
|
||||
"step": 1600,
|
||||
"token_acc": 0.6988938129948864
|
||||
},
|
||||
{
|
||||
"epoch": 0.30727172083460425,
|
||||
"grad_norm": 2.4375,
|
||||
"learning_rate": 9.709605159242199e-05,
|
||||
"loss": 1.3116970825195313,
|
||||
"step": 1700,
|
||||
"token_acc": 0.7015194406957871
|
||||
},
|
||||
{
|
||||
"epoch": 0.3253465279425221,
|
||||
"grad_norm": 2.5625,
|
||||
"learning_rate": 9.657326209320998e-05,
|
||||
"loss": 1.3130838012695312,
|
||||
"step": 1800,
|
||||
"token_acc": 0.7008979147977925
|
||||
},
|
||||
{
|
||||
"epoch": 0.34342133505044004,
|
||||
"grad_norm": 2.453125,
|
||||
"learning_rate": 9.600887833668701e-05,
|
||||
"loss": 1.2876347351074218,
|
||||
"step": 1900,
|
||||
"token_acc": 0.7059691653488992
|
||||
},
|
||||
{
|
||||
"epoch": 0.3614961421583579,
|
||||
"grad_norm": 2.28125,
|
||||
"learning_rate": 9.540340437007106e-05,
|
||||
"loss": 1.2805183410644532,
|
||||
"step": 2000,
|
||||
"token_acc": 0.7076493690679662
|
||||
},
|
||||
{
|
||||
"epoch": 0.3795709492662758,
|
||||
"grad_norm": 2.3125,
|
||||
"learning_rate": 9.475738093796172e-05,
|
||||
"loss": 1.2861131286621095,
|
||||
"step": 2100,
|
||||
"token_acc": 0.7055881842553867
|
||||
},
|
||||
{
|
||||
"epoch": 0.3976457563741937,
|
||||
"grad_norm": 2.375,
|
||||
"learning_rate": 9.407138499940496e-05,
|
||||
"loss": 1.268822479248047,
|
||||
"step": 2200,
|
||||
"token_acc": 0.7090846641855041
|
||||
},
|
||||
{
|
||||
"epoch": 0.4157205634821116,
|
||||
"grad_norm": 2.59375,
|
||||
"learning_rate": 9.334602921261492e-05,
|
||||
"loss": 1.2444308471679688,
|
||||
"step": 2300,
|
||||
"token_acc": 0.7131349902523053
|
||||
},
|
||||
{
|
||||
"epoch": 0.4337953705900295,
|
||||
"grad_norm": 3.015625,
|
||||
"learning_rate": 9.258196138781327e-05,
|
||||
"loss": 1.2656473541259765,
|
||||
"step": 2400,
|
||||
"token_acc": 0.7094099462846465
|
||||
},
|
||||
{
|
||||
"epoch": 0.4518701776979474,
|
||||
"grad_norm": 2.265625,
|
||||
"learning_rate": 9.177986390867419e-05,
|
||||
"loss": 1.2598892211914063,
|
||||
"step": 2500,
|
||||
"token_acc": 0.7109632032447007
|
||||
},
|
||||
{
|
||||
"epoch": 0.4699449848058653,
|
||||
"grad_norm": 2.5,
|
||||
"learning_rate": 9.09404531228924e-05,
|
||||
"loss": 1.2393927764892578,
|
||||
"step": 2600,
|
||||
"token_acc": 0.7133786975644326
|
||||
},
|
||||
{
|
||||
"epoch": 0.4880197919137832,
|
||||
"grad_norm": 2.3125,
|
||||
"learning_rate": 9.0064478702418e-05,
|
||||
"loss": 1.2412493896484376,
|
||||
"step": 2700,
|
||||
"token_acc": 0.7132032480227637
|
||||
},
|
||||
{
|
||||
"epoch": 0.5060945990217011,
|
||||
"grad_norm": 2.5625,
|
||||
"learning_rate": 8.915272297392945e-05,
|
||||
"loss": 1.2382020568847656,
|
||||
"step": 2800,
|
||||
"token_acc": 0.7131689546842609
|
||||
},
|
||||
{
|
||||
"epoch": 0.5241694061296189,
|
||||
"grad_norm": 2.640625,
|
||||
"learning_rate": 8.820600022014338e-05,
|
||||
"loss": 1.2349536895751954,
|
||||
"step": 2900,
|
||||
"token_acc": 0.7140737742682136
|
||||
},
|
||||
{
|
||||
"epoch": 0.5422442132375369,
|
||||
"grad_norm": 2.109375,
|
||||
"learning_rate": 8.722515595258402e-05,
|
||||
"loss": 1.2141342163085938,
|
||||
"step": 3000,
|
||||
"token_acc": 0.7181298317819385
|
||||
},
|
||||
{
|
||||
"epoch": 0.5603190203454548,
|
||||
"grad_norm": 2.078125,
|
||||
"learning_rate": 8.621106615646292e-05,
|
||||
"loss": 1.2058545684814452,
|
||||
"step": 3100,
|
||||
"token_acc": 0.7190379193709925
|
||||
},
|
||||
{
|
||||
"epoch": 0.5783938274533726,
|
||||
"grad_norm": 2.453125,
|
||||
"learning_rate": 8.51646365083426e-05,
|
||||
"loss": 1.2195273590087892,
|
||||
"step": 3200,
|
||||
"token_acc": 0.7170014250513891
|
||||
},
|
||||
{
|
||||
"epoch": 0.5964686345612905,
|
||||
"grad_norm": 2.3125,
|
||||
"learning_rate": 8.408680156728299e-05,
|
||||
"loss": 1.2225239562988282,
|
||||
"step": 3300,
|
||||
"token_acc": 0.71660874090857
|
||||
},
|
||||
{
|
||||
"epoch": 0.6145434416692085,
|
||||
"grad_norm": 2.359375,
|
||||
"learning_rate": 8.297852394019336e-05,
|
||||
"loss": 1.1935769653320312,
|
||||
"step": 3400,
|
||||
"token_acc": 0.721348535831292
|
||||
},
|
||||
{
|
||||
"epoch": 0.6326182487771264,
|
||||
"grad_norm": 2.328125,
|
||||
"learning_rate": 8.184079342213466e-05,
|
||||
"loss": 1.198054428100586,
|
||||
"step": 3500,
|
||||
"token_acc": 0.7195931038612298
|
||||
},
|
||||
{
|
||||
"epoch": 0.6506930558850442,
|
||||
"grad_norm": 2.15625,
|
||||
"learning_rate": 8.067462611234052e-05,
|
||||
"loss": 1.2039249420166016,
|
||||
"step": 3600,
|
||||
"token_acc": 0.7198145640636711
|
||||
},
|
||||
{
|
||||
"epoch": 0.6687678629929621,
|
||||
"grad_norm": 2.25,
|
||||
"learning_rate": 7.948106350674593e-05,
|
||||
"loss": 1.2044364166259767,
|
||||
"step": 3700,
|
||||
"token_acc": 0.7191590166885238
|
||||
},
|
||||
{
|
||||
"epoch": 0.6868426701008801,
|
||||
"grad_norm": 2.21875,
|
||||
"learning_rate": 7.826117156783461e-05,
|
||||
"loss": 1.1947254180908202,
|
||||
"step": 3800,
|
||||
"token_acc": 0.7211205989004562
|
||||
},
|
||||
{
|
||||
"epoch": 0.7049174772087979,
|
||||
"grad_norm": 2.171875,
|
||||
"learning_rate": 7.701603977263513e-05,
|
||||
"loss": 1.1763773345947266,
|
||||
"step": 3900,
|
||||
"token_acc": 0.7237809512705871
|
||||
},
|
||||
{
|
||||
"epoch": 0.7229922843167158,
|
||||
"grad_norm": 2.546875,
|
||||
"learning_rate": 7.574678013971672e-05,
|
||||
"loss": 1.180088424682617,
|
||||
"step": 4000,
|
||||
"token_acc": 0.7241476445274628
|
||||
},
|
||||
{
|
||||
"epoch": 0.7410670914246337,
|
||||
"grad_norm": 2.359375,
|
||||
"learning_rate": 7.445452623605307e-05,
|
||||
"loss": 1.1668415069580078,
|
||||
"step": 4100,
|
||||
"token_acc": 0.7264480800382547
|
||||
},
|
||||
{
|
||||
"epoch": 0.7591418985325517,
|
||||
"grad_norm": 2.015625,
|
||||
"learning_rate": 7.314043216464158e-05,
|
||||
"loss": 1.1807654571533204,
|
||||
"step": 4200,
|
||||
"token_acc": 0.7243762391137034
|
||||
},
|
||||
{
|
||||
"epoch": 0.7772167056404695,
|
||||
"grad_norm": 2.328125,
|
||||
"learning_rate": 7.180567153378193e-05,
|
||||
"loss": 1.1845186614990235,
|
||||
"step": 4300,
|
||||
"token_acc": 0.7214888736557151
|
||||
},
|
||||
{
|
||||
"epoch": 0.7952915127483874,
|
||||
"grad_norm": 2.1875,
|
||||
"learning_rate": 7.045143640893474e-05,
|
||||
"loss": 1.1583942413330077,
|
||||
"step": 4400,
|
||||
"token_acc": 0.7274813631693335
|
||||
},
|
||||
{
|
||||
"epoch": 0.8133663198563053,
|
||||
"grad_norm": 2.109375,
|
||||
"learning_rate": 6.907893624809609e-05,
|
||||
"loss": 1.145471420288086,
|
||||
"step": 4500,
|
||||
"token_acc": 0.7298424303929127
|
||||
},
|
||||
{
|
||||
"epoch": 0.8314411269642232,
|
||||
"grad_norm": 2.359375,
|
||||
"learning_rate": 6.768939682163902e-05,
|
||||
"loss": 1.1666727447509766,
|
||||
"step": 4600,
|
||||
"token_acc": 0.7264280244660613
|
||||
},
|
||||
{
|
||||
"epoch": 0.8495159340721411,
|
||||
"grad_norm": 2.09375,
|
||||
"learning_rate": 6.628405911758647e-05,
|
||||
"loss": 1.1622318267822265,
|
||||
"step": 4700,
|
||||
"token_acc": 0.7271438460195889
|
||||
},
|
||||
{
|
||||
"epoch": 0.867590741180059,
|
||||
"grad_norm": 2.15625,
|
||||
"learning_rate": 6.486417823329354e-05,
|
||||
"loss": 1.1364639282226563,
|
||||
"step": 4800,
|
||||
"token_acc": 0.7319144126512372
|
||||
},
|
||||
{
|
||||
"epoch": 0.8856655482879768,
|
||||
"grad_norm": 2.421875,
|
||||
"learning_rate": 6.34310222545287e-05,
|
||||
"loss": 1.1561846923828125,
|
||||
"step": 4900,
|
||||
"token_acc": 0.7282120514362783
|
||||
},
|
||||
{
|
||||
"epoch": 0.9037403553958948,
|
||||
"grad_norm": 2.109375,
|
||||
"learning_rate": 6.198587112295526e-05,
|
||||
"loss": 1.1464973449707032,
|
||||
"step": 5000,
|
||||
"token_acc": 0.7304544859576637
|
||||
},
|
||||
{
|
||||
"epoch": 0.9218151625038127,
|
||||
"grad_norm": 2.0,
|
||||
"learning_rate": 6.053001549302422e-05,
|
||||
"loss": 1.135927963256836,
|
||||
"step": 5100,
|
||||
"token_acc": 0.7309169370264882
|
||||
},
|
||||
{
|
||||
"epoch": 0.9398899696117305,
|
||||
"grad_norm": 2.359375,
|
||||
"learning_rate": 5.906475557929985e-05,
|
||||
"loss": 1.1362411499023437,
|
||||
"step": 5200,
|
||||
"token_acc": 0.7308731770004574
|
||||
},
|
||||
{
|
||||
"epoch": 0.9579647767196484,
|
||||
"grad_norm": 2.15625,
|
||||
"learning_rate": 5.759139999524705e-05,
|
||||
"loss": 1.132964859008789,
|
||||
"step": 5300,
|
||||
"token_acc": 0.7320876184986531
|
||||
},
|
||||
{
|
||||
"epoch": 0.9760395838275664,
|
||||
"grad_norm": 2.09375,
|
||||
"learning_rate": 5.611126458451772e-05,
|
||||
"loss": 1.1330313873291016,
|
||||
"step": 5400,
|
||||
"token_acc": 0.7321249508199824
|
||||
},
|
||||
{
|
||||
"epoch": 0.9941143909354843,
|
||||
"grad_norm": 2.1875,
|
||||
"learning_rate": 5.462567124577992e-05,
|
||||
"loss": 1.137665786743164,
|
||||
"step": 5500,
|
||||
"token_acc": 0.731388650772945
|
||||
},
|
||||
{
|
||||
"epoch": 1.0121101207623049,
|
||||
"grad_norm": 2.71875,
|
||||
"learning_rate": 5.3135946752139385e-05,
|
||||
"loss": 1.0870736694335938,
|
||||
"step": 5600,
|
||||
"token_acc": 0.7406857225004193
|
||||
},
|
||||
{
|
||||
"epoch": 1.0301849278702229,
|
||||
"grad_norm": 2.234375,
|
||||
"learning_rate": 5.1643421566207615e-05,
|
||||
"loss": 1.0574837493896485,
|
||||
"step": 5700,
|
||||
"token_acc": 0.7462299226419611
|
||||
},
|
||||
{
|
||||
"epoch": 1.0482597349781408,
|
||||
"grad_norm": 2.1875,
|
||||
"learning_rate": 5.0149428651874985e-05,
|
||||
"loss": 1.0600157165527344,
|
||||
"step": 5800,
|
||||
"token_acc": 0.7451135106751428
|
||||
},
|
||||
{
|
||||
"epoch": 1.0663345420860586,
|
||||
"grad_norm": 2.296875,
|
||||
"learning_rate": 4.86553022838499e-05,
|
||||
"loss": 1.0566656494140625,
|
||||
"step": 5900,
|
||||
"token_acc": 0.7469262925837835
|
||||
},
|
||||
{
|
||||
"epoch": 1.0844093491939766,
|
||||
"grad_norm": 2.1875,
|
||||
"learning_rate": 4.716237685602735e-05,
|
||||
"loss": 1.073977508544922,
|
||||
"step": 6000,
|
||||
"token_acc": 0.7416494081610407
|
||||
},
|
||||
{
|
||||
"epoch": 1.1024841563018946,
|
||||
"grad_norm": 2.359375,
|
||||
"learning_rate": 4.567198568975096e-05,
|
||||
"loss": 1.0501838684082032,
|
||||
"step": 6100,
|
||||
"token_acc": 0.7475906446092413
|
||||
},
|
||||
{
|
||||
"epoch": 1.1205589634098123,
|
||||
"grad_norm": 1.9453125,
|
||||
"learning_rate": 4.418545984303294e-05,
|
||||
"loss": 1.0559381866455078,
|
||||
"step": 6200,
|
||||
"token_acc": 0.7470775683736796
|
||||
},
|
||||
{
|
||||
"epoch": 1.1386337705177303,
|
||||
"grad_norm": 2.09375,
|
||||
"learning_rate": 4.2704126921795424e-05,
|
||||
"loss": 1.0539588928222656,
|
||||
"step": 6300,
|
||||
"token_acc": 0.7466571018782976
|
||||
},
|
||||
{
|
||||
"epoch": 1.1567085776256483,
|
||||
"grad_norm": 2.125,
|
||||
"learning_rate": 4.1229309894194806e-05,
|
||||
"loss": 1.0646955108642577,
|
||||
"step": 6400,
|
||||
"token_acc": 0.7450717568377625
|
||||
},
|
||||
{
|
||||
"epoch": 1.174783384733566,
|
||||
"grad_norm": 1.8828125,
|
||||
"learning_rate": 3.976232590908812e-05,
|
||||
"loss": 1.0539531707763672,
|
||||
"step": 6500,
|
||||
"token_acc": 0.7465753023540981
|
||||
},
|
||||
{
|
||||
"epoch": 1.192858191841484,
|
||||
"grad_norm": 2.25,
|
||||
"learning_rate": 3.830448511969638e-05,
|
||||
"loss": 1.0504056549072265,
|
||||
"step": 6600,
|
||||
"token_acc": 0.7470591527899016
|
||||
},
|
||||
{
|
||||
"epoch": 1.2109329989494018,
|
||||
"grad_norm": 1.9921875,
|
||||
"learning_rate": 3.6857089513516035e-05,
|
||||
"loss": 1.0537297821044922,
|
||||
"step": 6700,
|
||||
"token_acc": 0.746264189895728
|
||||
},
|
||||
{
|
||||
"epoch": 1.2290078060573197,
|
||||
"grad_norm": 2.078125,
|
||||
"learning_rate": 3.542143174952282e-05,
|
||||
"loss": 1.0569972229003906,
|
||||
"step": 6800,
|
||||
"token_acc": 0.7456408105039157
|
||||
},
|
||||
{
|
||||
"epoch": 1.2470826131652377,
|
||||
"grad_norm": 2.078125,
|
||||
"learning_rate": 3.399879400370704e-05,
|
||||
"loss": 1.0594657897949218,
|
||||
"step": 6900,
|
||||
"token_acc": 0.7464631930608675
|
||||
},
|
||||
{
|
||||
"epoch": 1.2651574202731555,
|
||||
"grad_norm": 2.296875,
|
||||
"learning_rate": 3.259044682397107e-05,
|
||||
"loss": 1.059138946533203,
|
||||
"step": 7000,
|
||||
"token_acc": 0.7466058265866723
|
||||
},
|
||||
{
|
||||
"epoch": 1.2832322273810735,
|
||||
"grad_norm": 2.140625,
|
||||
"learning_rate": 3.119764799541187e-05,
|
||||
"loss": 1.0547212982177734,
|
||||
"step": 7100,
|
||||
"token_acc": 0.7473527098438676
|
||||
},
|
||||
{
|
||||
"epoch": 1.3013070344889912,
|
||||
"grad_norm": 2.125,
|
||||
"learning_rate": 2.9821641417001806e-05,
|
||||
"loss": 1.0402613067626953,
|
||||
"step": 7200,
|
||||
"token_acc": 0.7493453306325137
|
||||
},
|
||||
{
|
||||
"epoch": 1.3193818415969092,
|
||||
"grad_norm": 2.140625,
|
||||
"learning_rate": 2.846365599067111e-05,
|
||||
"loss": 1.045955352783203,
|
||||
"step": 7300,
|
||||
"token_acc": 0.7483038206918617
|
||||
},
|
||||
{
|
||||
"epoch": 1.3374566487048272,
|
||||
"grad_norm": 2.203125,
|
||||
"learning_rate": 2.7124904523784144e-05,
|
||||
"loss": 1.0378961944580078,
|
||||
"step": 7400,
|
||||
"token_acc": 0.7502343443728431
|
||||
},
|
||||
{
|
||||
"epoch": 1.355531455812745,
|
||||
"grad_norm": 2.46875,
|
||||
"learning_rate": 2.580658264598942e-05,
|
||||
"loss": 1.0476718902587892,
|
||||
"step": 7500,
|
||||
"token_acc": 0.7483254533842428
|
||||
},
|
||||
{
|
||||
"epoch": 1.373606262920663,
|
||||
"grad_norm": 2.109375,
|
||||
"learning_rate": 2.450986774141123e-05,
|
||||
"loss": 1.0452989196777345,
|
||||
"step": 7600,
|
||||
"token_acc": 0.7482226148441731
|
||||
},
|
||||
{
|
||||
"epoch": 1.3916810700285809,
|
||||
"grad_norm": 2.140625,
|
||||
"learning_rate": 2.3235917897135934e-05,
|
||||
"loss": 1.0406829833984375,
|
||||
"step": 7700,
|
||||
"token_acc": 0.74961852670988
|
||||
},
|
||||
{
|
||||
"epoch": 1.4097558771364986,
|
||||
"grad_norm": 2.3125,
|
||||
"learning_rate": 2.1985870868932456e-05,
|
||||
"loss": 1.0259892272949218,
|
||||
"step": 7800,
|
||||
"token_acc": 0.7533507908450509
|
||||
},
|
||||
{
|
||||
"epoch": 1.4278306842444166,
|
||||
"grad_norm": 2.421875,
|
||||
"learning_rate": 2.076084306513049e-05,
|
||||
"loss": 1.0363540649414062,
|
||||
"step": 7900,
|
||||
"token_acc": 0.7503043904274775
|
||||
},
|
||||
{
|
||||
"epoch": 1.4459054913523346,
|
||||
"grad_norm": 1.984375,
|
||||
"learning_rate": 1.9561928549563968e-05,
|
||||
"loss": 1.0644924926757813,
|
||||
"step": 8000,
|
||||
"token_acc": 0.7456255842220881
|
||||
},
|
||||
{
|
||||
"epoch": 1.4639802984602523,
|
||||
"grad_norm": 2.125,
|
||||
"learning_rate": 1.839019806447024e-05,
|
||||
"loss": 1.0405005645751952,
|
||||
"step": 8100,
|
||||
"token_acc": 0.7495747171714061
|
||||
},
|
||||
{
|
||||
"epoch": 1.4820551055681703,
|
||||
"grad_norm": 2.09375,
|
||||
"learning_rate": 1.724669807421762e-05,
|
||||
"loss": 1.0440809631347656,
|
||||
"step": 8200,
|
||||
"token_acc": 0.7486369722470563
|
||||
},
|
||||
{
|
||||
"epoch": 1.5001299126760883,
|
||||
"grad_norm": 2.125,
|
||||
"learning_rate": 1.6132449830715263e-05,
|
||||
"loss": 1.0530775451660157,
|
||||
"step": 8300,
|
||||
"token_acc": 0.7477425947235785
|
||||
},
|
||||
{
|
||||
"epoch": 1.518204719784006,
|
||||
"grad_norm": 2.109375,
|
||||
"learning_rate": 1.5048448461340258e-05,
|
||||
"loss": 1.0460784912109375,
|
||||
"step": 8400,
|
||||
"token_acc": 0.7489244523851678
|
||||
},
|
||||
{
|
||||
"epoch": 1.5362795268919238,
|
||||
"grad_norm": 2.125,
|
||||
"learning_rate": 1.3995662080196215e-05,
|
||||
"loss": 1.0327759552001954,
|
||||
"step": 8500,
|
||||
"token_acc": 0.7517219276186928
|
||||
},
|
||||
{
|
||||
"epoch": 1.5543543339998418,
|
||||
"grad_norm": 1.8125,
|
||||
"learning_rate": 1.2975030923497262e-05,
|
||||
"loss": 1.037949981689453,
|
||||
"step": 8600,
|
||||
"token_acc": 0.7504533222058104
|
||||
},
|
||||
{
|
||||
"epoch": 1.5724291411077598,
|
||||
"grad_norm": 2.109375,
|
||||
"learning_rate": 1.1987466509849655e-05,
|
||||
"loss": 1.0523592376708983,
|
||||
"step": 8700,
|
||||
"token_acc": 0.7466913343954412
|
||||
},
|
||||
{
|
||||
"epoch": 1.5905039482156775,
|
||||
"grad_norm": 2.265625,
|
||||
"learning_rate": 1.1033850826180781e-05,
|
||||
"loss": 1.048785171508789,
|
||||
"step": 8800,
|
||||
"token_acc": 0.7472665235971139
|
||||
},
|
||||
{
|
||||
"epoch": 1.6085787553235955,
|
||||
"grad_norm": 2.0,
|
||||
"learning_rate": 1.0115035540042784e-05,
|
||||
"loss": 1.032520523071289,
|
||||
"step": 8900,
|
||||
"token_acc": 0.7507173368953239
|
||||
},
|
||||
{
|
||||
"epoch": 1.6266535624315135,
|
||||
"grad_norm": 2.21875,
|
||||
"learning_rate": 9.231841238994194e-06,
|
||||
"loss": 1.043864974975586,
|
||||
"step": 9000,
|
||||
"token_acc": 0.7493066372338768
|
||||
},
|
||||
{
|
||||
"epoch": 1.6447283695394312,
|
||||
"grad_norm": 1.90625,
|
||||
"learning_rate": 8.385056697738796e-06,
|
||||
"loss": 1.0542935943603515,
|
||||
"step": 9100,
|
||||
"token_acc": 0.7465372369884397
|
||||
},
|
||||
{
|
||||
"epoch": 1.6628031766473492,
|
||||
"grad_norm": 1.984375,
|
||||
"learning_rate": 7.575438173676513e-06,
|
||||
"loss": 1.031275177001953,
|
||||
"step": 9200,
|
||||
"token_acc": 0.7517255178380788
|
||||
},
|
||||
{
|
||||
"epoch": 1.6808779837552672,
|
||||
"grad_norm": 2.125,
|
||||
"learning_rate": 6.803708731495117e-06,
|
||||
"loss": 1.0453128814697266,
|
||||
"step": 9300,
|
||||
"token_acc": 0.7487111883579803
|
||||
},
|
||||
{
|
||||
"epoch": 1.698952790863185,
|
||||
"grad_norm": 1.8984375,
|
||||
"learning_rate": 6.070557597406163e-06,
|
||||
"loss": 1.0431288146972657,
|
||||
"step": 9400,
|
||||
"token_acc": 0.7488178364241926
|
||||
},
|
||||
{
|
||||
"epoch": 1.717027597971103,
|
||||
"grad_norm": 1.96875,
|
||||
"learning_rate": 5.376639543601858e-06,
|
||||
"loss": 1.0395802307128905,
|
||||
"step": 9500,
|
||||
"token_acc": 0.7503542725542914
|
||||
},
|
||||
{
|
||||
"epoch": 1.735102405079021,
|
||||
"grad_norm": 2.078125,
|
||||
"learning_rate": 4.722574303482557e-06,
|
||||
"loss": 1.0538075256347657,
|
||||
"step": 9600,
|
||||
"token_acc": 0.7472696432430282
|
||||
},
|
||||
{
|
||||
"epoch": 1.7531772121869387,
|
||||
"grad_norm": 2.234375,
|
||||
"learning_rate": 4.1089460181771675e-06,
|
||||
"loss": 1.046026382446289,
|
||||
"step": 9700,
|
||||
"token_acc": 0.7478882170852356
|
||||
},
|
||||
{
|
||||
"epoch": 1.7712520192948564,
|
||||
"grad_norm": 2.15625,
|
||||
"learning_rate": 3.5363027148507423e-06,
|
||||
"loss": 1.0465138244628907,
|
||||
"step": 9800,
|
||||
"token_acc": 0.7479565299418576
|
||||
},
|
||||
{
|
||||
"epoch": 1.7893268264027746,
|
||||
"grad_norm": 2.28125,
|
||||
"learning_rate": 3.0051558172652316e-06,
|
||||
"loss": 1.0322959899902344,
|
||||
"step": 9900,
|
||||
"token_acc": 0.752127805838985
|
||||
},
|
||||
{
|
||||
"epoch": 1.8074016335106924,
|
||||
"grad_norm": 2.21875,
|
||||
"learning_rate": 2.5159796890304564e-06,
|
||||
"loss": 1.0446186065673828,
|
||||
"step": 10000,
|
||||
"token_acc": 0.7488376764944241
|
||||
},
|
||||
{
|
||||
"epoch": 1.8254764406186101,
|
||||
"grad_norm": 2.0625,
|
||||
"learning_rate": 2.069211209953287e-06,
|
||||
"loss": 1.0309945678710937,
|
||||
"step": 10100,
|
||||
"token_acc": 0.7524413140197763
|
||||
},
|
||||
{
|
||||
"epoch": 1.8435512477265281,
|
||||
"grad_norm": 1.8203125,
|
||||
"learning_rate": 1.6652493858632823e-06,
|
||||
"loss": 1.0363735198974608,
|
||||
"step": 10200,
|
||||
"token_acc": 0.7503588935333034
|
||||
},
|
||||
{
|
||||
"epoch": 1.861626054834446,
|
||||
"grad_norm": 2.296875,
|
||||
"learning_rate": 1.3044549922633876e-06,
|
||||
"loss": 1.0336082458496094,
|
||||
"step": 10300,
|
||||
"token_acc": 0.7506362801039103
|
||||
},
|
||||
{
|
||||
"epoch": 1.8797008619423639,
|
||||
"grad_norm": 2.046875,
|
||||
"learning_rate": 9.871502521237975e-07,
|
||||
"loss": 1.0328756713867187,
|
||||
"step": 10400,
|
||||
"token_acc": 0.7518247545659716
|
||||
},
|
||||
{
|
||||
"epoch": 1.8977756690502818,
|
||||
"grad_norm": 1.8671875,
|
||||
"learning_rate": 7.136185481068925e-07,
|
||||
"loss": 1.0489426422119141,
|
||||
"step": 10500,
|
||||
"token_acc": 0.747691160714326
|
||||
},
|
||||
{
|
||||
"epoch": 1.9158504761581998,
|
||||
"grad_norm": 1.765625,
|
||||
"learning_rate": 4.841041694801208e-07,
|
||||
"loss": 1.0270442962646484,
|
||||
"step": 10600,
|
||||
"token_acc": 0.7519379627407866
|
||||
},
|
||||
{
|
||||
"epoch": 1.9339252832661176,
|
||||
"grad_norm": 2.0,
|
||||
"learning_rate": 2.988120939429684e-07,
|
||||
"loss": 1.0303496551513671,
|
||||
"step": 10700,
|
||||
"token_acc": 0.7513509325616106
|
||||
},
|
||||
{
|
||||
"epoch": 1.9520000903740355,
|
||||
"grad_norm": 1.890625,
|
||||
"learning_rate": 1.5790780456277355e-07,
|
||||
"loss": 1.0394702911376954,
|
||||
"step": 10800,
|
||||
"token_acc": 0.7495070401423912
|
||||
},
|
||||
{
|
||||
"epoch": 1.9700748974819535,
|
||||
"grad_norm": 2.390625,
|
||||
"learning_rate": 6.15171419829752e-08,
|
||||
"loss": 1.0353932189941406,
|
||||
"step": 10900,
|
||||
"token_acc": 0.7506620155660084
|
||||
},
|
||||
{
|
||||
"epoch": 1.9881497045898713,
|
||||
"grad_norm": 2.15625,
|
||||
"learning_rate": 9.726192035691694e-09,
|
||||
"loss": 1.0364900970458983,
|
||||
"step": 11000,
|
||||
"token_acc": 0.7511165613553197
|
||||
}
|
||||
],
|
||||
"logging_steps": 100,
|
||||
"max_steps": 11066,
|
||||
"num_input_tokens_seen": 0,
|
||||
"num_train_epochs": 2,
|
||||
"save_steps": 1000,
|
||||
"stateful_callbacks": {
|
||||
"TrainerControl": {
|
||||
"args": {
|
||||
"should_epoch_stop": false,
|
||||
"should_evaluate": false,
|
||||
"should_log": false,
|
||||
"should_save": true,
|
||||
"should_training_stop": true
|
||||
},
|
||||
"attributes": {}
|
||||
}
|
||||
},
|
||||
"total_flos": 8.597718898108247e+17,
|
||||
"train_batch_size": 8,
|
||||
"trial_name": null,
|
||||
"trial_params": null
|
||||
}
|
||||
3
training_args.bin
Normal file
3
training_args.bin
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:a53afc965adde4abf79521e988467b9e3ccedf21f656c2f533f82df1965c7abf
|
||||
size 6993
|
||||
Reference in New Issue
Block a user