初始化项目,由ModelHub XC社区提供模型

Model: soynade-research/oolel-lit-gemma
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-04-22 15:59:49 +08:00
commit 62325d65bd
17 changed files with 52930 additions and 0 deletions

36
.gitattributes vendored Normal file
View File

@@ -0,0 +1,36 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
tokenizer.json filter=lfs diff=lfs merge=lfs -text

96
README.md Normal file
View File

@@ -0,0 +1,96 @@
---
license: agpl-3.0
datasets:
- soynade-research/FineWeb2-HQ-50k-Wolof
language:
- wo
- en
- fr
base_model:
- google/gemma-3-270m-it
pipeline_tag: text-generation
---
# Oolel-lit-gemma
Oolel-lit-gemma is a fine-tuned version of [Gemma-3-270m-it](https://huggingface.co/google/gemma-3-270m-it)
for the Wolof language. It is part of our Oolel family of compact, on-device Wolof language models.
The model was trained using supervised fine-tuning (SFT) on synthetic data distilled from our
larger **Oolel-7B** models via [Oolel-translator](https://github.com/soynade-research/oolel-translator).
## Usage
### Quick start with pipeline
```python
from transformers import pipeline
generator = pipeline(
"text-generation",
model="soynade-research/oolel-lit-gemma",
device="cuda",
)
messages = [{"role": "user", "content": "Translate to Wolof: The president is 45 years old."}]
output = generator(messages, max_new_tokens=256, return_full_text=False)
print(output[0]["generated_text"])
```
### With AutoModel for more control
```python
from transformers import AutoTokenizer, Gemma3ForCausalLM
import torch
model_id = "soynade-research/oolel-lit-gemma"
model = Gemma3ForCausalLM.from_pretrained(
model_id
).eval()
tokenizer = AutoTokenizer.from_pretrained(model_id)
messages = [
[
{
"role": "system",
"content": [{"type": "text", "text": "You're a Wolof AI assistant. Please always provide detailed and useful answers to the user queries."},]
},
{
"role": "user",
"content": [{"type": "text", "text": "Translate to Wolof: The president is 45 years old."},]
},
],
]
inputs = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
).to(model.device).to(torch.bfloat16)
with torch.inference_mode():
outputs = model.generate(**inputs, max_new_tokens=256,
do_sample=True,
temperature=0.7,
top_p=0.9,)
outputs = tokenizer.batch_decode(outputs)
```
## Training
The training code and configuration are available at
[soynade-research/oolel-trainer](https://github.com/soynade-research/oolel-trainer).
## Limitations
- Primarily optimized for Wolof; performance on other languages may vary
- As a 270M parameter model, it may struggle with complex tasks
- Outputs should be verified by a native Wolof speaker for critical applications

3
added_tokens.json Normal file
View File

@@ -0,0 +1,3 @@
{
"<image_soft_token>": 262144
}

362
args.json Normal file
View File

@@ -0,0 +1,362 @@
{
"output_dir": "/workspace/output/v0-20260216-180455",
"overwrite_output_dir": false,
"do_train": false,
"do_eval": false,
"do_predict": false,
"eval_strategy": "no",
"prediction_loss_only": false,
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 1,
"per_gpu_train_batch_size": null,
"per_gpu_eval_batch_size": null,
"gradient_accumulation_steps": 16,
"eval_accumulation_steps": null,
"eval_delay": 0,
"torch_empty_cache_steps": null,
"learning_rate": 0.0001,
"weight_decay": 0.1,
"adam_beta1": 0.9,
"adam_beta2": 0.95,
"adam_epsilon": 1e-08,
"max_grad_norm": 1.0,
"num_train_epochs": 2.0,
"max_steps": -1,
"lr_scheduler_type": "cosine",
"lr_scheduler_kwargs": null,
"warmup_ratio": 0.05,
"warmup_steps": 0,
"log_level": "passive",
"log_level_replica": "warning",
"log_on_each_node": true,
"logging_dir": "/workspace/output/v0-20260216-180455/runs",
"logging_strategy": "steps",
"logging_first_step": true,
"logging_steps": 100,
"logging_nan_inf_filter": true,
"save_strategy": "steps",
"save_steps": 1000.0,
"save_total_limit": 3,
"save_safetensors": true,
"save_on_each_node": false,
"save_only_model": false,
"restore_callback_states_from_checkpoint": false,
"no_cuda": false,
"use_cpu": false,
"use_mps_device": false,
"seed": 42,
"data_seed": 42,
"jit_mode_eval": false,
"bf16": true,
"fp16": false,
"fp16_opt_level": "O1",
"half_precision_backend": "auto",
"bf16_full_eval": false,
"fp16_full_eval": false,
"tf32": null,
"local_rank": -1,
"ddp_backend": null,
"tpu_num_cores": null,
"tpu_metrics_debug": false,
"debug": null,
"dataloader_drop_last": false,
"eval_steps": null,
"dataloader_num_workers": 16,
"dataloader_prefetch_factor": null,
"past_index": -1,
"run_name": "/workspace/output/v0-20260216-180455",
"disable_tqdm": null,
"remove_unused_columns": true,
"label_names": null,
"load_best_model_at_end": false,
"metric_for_best_model": "loss",
"greater_is_better": false,
"ignore_data_skip": false,
"fsdp": [],
"fsdp_min_num_params": 0,
"fsdp_config": null,
"fsdp_transformer_layer_cls_to_wrap": null,
"accelerator_config": {
"dispatch_batches": false
},
"parallelism_config": null,
"deepspeed": null,
"label_smoothing_factor": 0.0,
"optim": "adamw_torch_fused",
"optim_args": null,
"adafactor": false,
"group_by_length": false,
"length_column_name": "length",
"report_to": [
"tensorboard"
],
"project": "huggingface",
"trackio_space_id": "trackio",
"ddp_find_unused_parameters": null,
"ddp_bucket_cap_mb": null,
"ddp_broadcast_buffers": null,
"dataloader_pin_memory": true,
"dataloader_persistent_workers": false,
"skip_memory_metrics": true,
"use_legacy_prediction_loop": false,
"push_to_hub": true,
"resume_from_checkpoint": null,
"hub_model_id": null,
"hub_strategy": "every_save",
"hub_token": null,
"hub_private_repo": true,
"hub_always_push": false,
"hub_revision": null,
"gradient_checkpointing": true,
"gradient_checkpointing_kwargs": null,
"include_inputs_for_metrics": false,
"include_for_metrics": [],
"eval_do_concat_batches": true,
"fp16_backend": "auto",
"push_to_hub_model_id": null,
"push_to_hub_organization": null,
"push_to_hub_token": null,
"mp_parameters": "",
"auto_find_batch_size": false,
"full_determinism": false,
"torchdynamo": null,
"ray_scope": "last",
"ddp_timeout": 18000000,
"torch_compile": false,
"torch_compile_backend": null,
"torch_compile_mode": null,
"include_tokens_per_second": false,
"include_num_input_tokens_seen": false,
"neftune_noise_alpha": null,
"optim_target_modules": null,
"batch_eval_metrics": false,
"eval_on_start": false,
"use_liger_kernel": false,
"liger_kernel_config": null,
"eval_use_gather_object": false,
"average_tokens_across_devices": true,
"sortish_sampler": false,
"predict_with_generate": false,
"generation_max_length": null,
"generation_num_beams": null,
"generation_config": null,
"tuner_backend": "peft",
"vit_gradient_checkpointing": null,
"router_aux_loss_coef": 0.0,
"enable_dft_loss": false,
"enable_channel_loss": false,
"check_model": true,
"acc_strategy": "token",
"train_dataloader_shuffle": true,
"max_epochs": null,
"aligner_lr": null,
"vit_lr": null,
"use_logits_to_keep": null,
"ds3_gather_for_generation": true,
"resume_only_model": false,
"optimizer": null,
"loss_type": null,
"metric": null,
"eval_use_evalscope": false,
"eval_dataset": [],
"eval_dataset_args": null,
"eval_limit": null,
"eval_generation_config": null,
"extra_eval_args": null,
"use_flash_ckpt": false,
"use_ray": false,
"ray_exp_name": null,
"device_groups": null,
"model": "google/gemma-3-270m-it",
"model_type": "gemma3_text",
"model_revision": null,
"task_type": "causal_lm",
"torch_dtype": "bfloat16",
"attn_impl": "flash_attn",
"new_special_tokens": [],
"num_labels": null,
"problem_type": null,
"rope_scaling": null,
"device_map": null,
"max_memory": {},
"max_model_len": null,
"local_repo_path": null,
"init_strategy": null,
"template": "gemma3_text",
"system": null,
"max_length": 4096,
"truncation_strategy": "delete",
"max_pixels": null,
"agent_template": null,
"norm_bbox": null,
"use_chat_template": true,
"padding_side": "right",
"padding_free": false,
"loss_scale": "default",
"sequence_parallel_size": 1,
"template_backend": "swift",
"response_prefix": null,
"enable_thinking": null,
"add_non_thinking_prefix": true,
"dataset": [
"soynade-research/wo-dioula"
],
"val_dataset": [],
"cached_dataset": [],
"cached_val_dataset": [],
"split_dataset_ratio": 0.0,
"dataset_num_proc": 1,
"load_from_cache_file": false,
"dataset_shuffle": true,
"val_dataset_shuffle": false,
"streaming": false,
"interleave_prob": null,
"stopping_strategy": "first_exhausted",
"shuffle_buffer_size": 1000,
"download_mode": "reuse_dataset_if_exists",
"columns": {},
"strict": false,
"model_name": [
"oolel-lit-gemma"
],
"model_author": [
"soynade-research"
],
"custom_dataset_info": [],
"quant_method": null,
"quant_bits": null,
"hqq_axis": null,
"bnb_4bit_compute_dtype": "bfloat16",
"bnb_4bit_quant_type": "nf4",
"bnb_4bit_use_double_quant": true,
"bnb_4bit_quant_storage": null,
"max_new_tokens": 64,
"temperature": 0.0,
"top_k": null,
"top_p": null,
"repetition_penalty": null,
"num_beams": 1,
"stream": false,
"stop_words": [],
"logprobs": false,
"top_logprobs": null,
"structured_outputs_regex": null,
"ckpt_dir": null,
"lora_modules": [],
"train_type": "full",
"adapters": [],
"external_plugins": [],
"model_kwargs": {},
"load_args": false,
"load_data_args": false,
"packing": false,
"packing_length": null,
"packing_num_proc": 1,
"lazy_tokenize": false,
"custom_register_path": [],
"use_hf": true,
"ignore_args_error": false,
"use_swift_lora": false,
"freeze_parameters": [],
"freeze_parameters_regex": null,
"freeze_parameters_ratio": 0.0,
"trainable_parameters": [],
"trainable_parameters_regex": null,
"freeze_llm": false,
"freeze_vit": true,
"freeze_aligner": true,
"target_modules": [
"all-linear"
],
"target_regex": null,
"target_parameters": null,
"modules_to_save": [],
"lora_rank": 8,
"lora_alpha": 32,
"lora_dropout": 0.05,
"lora_bias": "none",
"lora_dtype": null,
"lorap_lr_ratio": null,
"use_rslora": false,
"use_dora": false,
"lora_ga_batch_size": 2,
"lora_ga_iters": 2,
"lora_ga_max_length": 1024,
"lora_ga_direction": "ArB2r",
"lora_ga_scale": "stable",
"lora_ga_stable_gamma": 16,
"init_weights": true,
"fourier_n_frequency": 2000,
"fourier_scaling": 300.0,
"boft_block_size": 4,
"boft_block_num": 0,
"boft_n_butterfly_factor": 1,
"boft_dropout": 0.0,
"vera_rank": 256,
"vera_projection_prng_key": 0,
"vera_dropout": 0.0,
"vera_d_initial": 0.1,
"adapter_act": "gelu",
"adapter_length": 128,
"use_galore": false,
"galore_target_modules": null,
"galore_rank": 128,
"galore_update_proj_gap": 50,
"galore_scale": 1.0,
"galore_proj_type": "std",
"galore_optim_per_parameter": false,
"galore_with_embedding": false,
"galore_quantization": false,
"galore_proj_quant": false,
"galore_proj_bits": 4,
"galore_proj_group_size": 256,
"galore_cos_threshold": 0.4,
"galore_gamma_proj": 2,
"galore_queue_size": 5,
"adalora_target_r": 8,
"adalora_init_r": 12,
"adalora_tinit": 0,
"adalora_tfinal": 0,
"adalora_deltaT": 1,
"adalora_beta1": 0.85,
"adalora_beta2": 0.85,
"adalora_orth_reg_weight": 0.5,
"llamapro_num_new_blocks": 4,
"llamapro_num_groups": null,
"lisa_activated_layers": 0,
"lisa_step_interval": 20,
"reft_layer_key": null,
"reft_layers": null,
"reft_rank": 4,
"reft_intervention_type": "LoreftIntervention",
"reft_args": null,
"swanlab_token": null,
"swanlab_project": "ms-swift",
"swanlab_workspace": null,
"swanlab_exp_name": null,
"swanlab_notification_method": null,
"swanlab_webhook_url": null,
"swanlab_secret": null,
"swanlab_sender_email": null,
"swanlab_receiver_email": null,
"swanlab_smtp_server": null,
"swanlab_smtp_port": null,
"swanlab_email_language": "zh",
"swanlab_mode": "cloud",
"add_version": true,
"create_checkpoint_symlink": false,
"zero_hpz_partition_size": null,
"deepspeed_autotp_size": null,
"early_stop_interval": null,
"rank": -1,
"global_world_size": 1,
"local_world_size": 1,
"model_suffix": "gemma-3-270m-it",
"model_info": "ModelInfo(model_type='gemma3_text', model_dir='/workspace/hf_artifacts/hub/models--google--gemma-3-270m-it/snapshots/ac82b4e820549b854eebf28ce6dedaf9fdfa17b3', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, is_multimodal=False, config=None, task_type='causal_lm', num_labels=None)",
"model_meta": "ModelMeta(model_type='gemma3_text', model_groups=[ModelGroup(models=[Model(ms_model_id='LLM-Research/gemma-3-1b-pt', hf_model_id='google/gemma-3-1b-pt', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='LLM-Research/gemma-3-1b-it', hf_model_id='google/gemma-3-1b-it', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='google/gemma-3-270m', hf_model_id='google/gemma-3-270m', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='google/gemma-3-270m-it', hf_model_id='google/gemma-3-270m-it', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='google/medgemma-27b-text-it', hf_model_id='google/medgemma-27b-text-it', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='gemma3_text', get_function=<function get_model_tokenizer_gemma3_text at 0x70064a6a37e0>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Gemma3ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49'], tags=[])",
"model_dir": "/workspace/hf_artifacts/hub/models--google--gemma-3-270m-it/snapshots/ac82b4e820549b854eebf28ce6dedaf9fdfa17b3",
"_val_dataset_exists": [],
"hub": "<class 'swift.hub.hub.HFHub'>",
"evaluation_strategy": "no",
"training_args": "Seq2SeqTrainingArguments(output_dir='/workspace/output/v0-20260216-180455', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, eval_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=16, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=2.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/workspace/output/v0-20260216-180455/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=100, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=1000, save_total_limit=3, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=None, dataloader_num_workers=16, dataloader_prefetch_factor=2, past_index=-1, run_name='/workspace/output/v0-20260216-180455', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 
fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], project='huggingface', trackio_space_id='trackio', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=True, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=True, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, 
acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, chord_sft_dataset=[], chord_sft_per_device_train_batch_size=None, chord_enable_phi_function=False, chord_mu_warmup_steps=None, chord_mu_decay_steps=None, chord_mu_peak=None, chord_mu_valley=None, train_type='full', local_repo_path=None, galore_config=None, task_type='causal_lm', problem_type=None)"
}

47
chat_template.jinja Normal file
View File

@@ -0,0 +1,47 @@
{{ bos_token }}
{%- if messages[0]['role'] == 'system' -%}
{%- if messages[0]['content'] is string -%}
{%- set first_user_prefix = messages[0]['content'] + '
' -%}
{%- else -%}
{%- set first_user_prefix = messages[0]['content'][0]['text'] + '
' -%}
{%- endif -%}
{%- set loop_messages = messages[1:] -%}
{%- else -%}
{%- set first_user_prefix = "" -%}
{%- set loop_messages = messages -%}
{%- endif -%}
{%- for message in loop_messages -%}
{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
{{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
{%- endif -%}
{%- if (message['role'] == 'assistant') -%}
{%- set role = "model" -%}
{%- else -%}
{%- set role = message['role'] -%}
{%- endif -%}
{{ '<start_of_turn>' + role + '
' + (first_user_prefix if loop.first else "") }}
{%- if message['content'] is string -%}
{{ message['content'] | trim }}
{%- elif message['content'] is iterable -%}
{%- for item in message['content'] -%}
{%- if item['type'] == 'image' -%}
{{ '<start_of_image>' }}
{%- elif item['type'] == 'text' -%}
{{ item['text'] | trim }}
{%- endif -%}
{%- endfor -%}
{%- else -%}
{{ raise_exception("Invalid content type") }}
{%- endif -%}
{{ '<end_of_turn>
' }}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{'<start_of_turn>model
'}}
{%- endif -%}

54
config.json Normal file
View File

@@ -0,0 +1,54 @@
{
"_sliding_window_pattern": 6,
"architectures": [
"Gemma3ForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"attn_logit_softcapping": null,
"bos_token_id": 2,
"dtype": "bfloat16",
"eos_token_id": 1,
"final_logit_softcapping": null,
"head_dim": 256,
"hidden_activation": "gelu_pytorch_tanh",
"hidden_size": 640,
"initializer_range": 0.02,
"intermediate_size": 2048,
"layer_types": [
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"full_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"full_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"full_attention"
],
"max_position_embeddings": 32768,
"model_type": "gemma3_text",
"num_attention_heads": 4,
"num_hidden_layers": 18,
"num_key_value_heads": 1,
"pad_token_id": 0,
"query_pre_attn_scalar": 256,
"rms_norm_eps": 1e-06,
"rope_local_base_freq": 10000.0,
"rope_scaling": null,
"rope_theta": 1000000.0,
"sliding_window": 512,
"transformers_version": "4.57.6",
"use_bidirectional_attention": false,
"use_cache": false,
"vocab_size": 262144
}

11
generation_config.json Normal file
View File

@@ -0,0 +1,11 @@
{
"cache_implementation": "hybrid",
"do_sample": true,
"eos_token_id": [
1,
106
],
"top_k": 64,
"top_p": 0.95,
"transformers_version": "4.57.6"
}

3
model.safetensors Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1fc70f5244b2b7169ddd3291e41e6dc26dbee19083d1e94543a6db45d34a2c1a
size 536223056

3
optimizer.pt Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1f61eb9eddbe080a81e2770902928fc08589bdfda8a865664694de46cc3a0b81
size 1072594443

3
rng_state.pth Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
size 14645

3
scheduler.pt Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:94b9d21250e0c4c9206344307e30b2bb0ff729fb03b4200d5a8004e9714582e5
size 1465

33
special_tokens_map.json Normal file
View File

@@ -0,0 +1,33 @@
{
"boi_token": "<start_of_image>",
"bos_token": {
"content": "<bos>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eoi_token": "<end_of_image>",
"eos_token": {
"content": "<eos>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"image_token": "<image_soft_token>",
"pad_token": {
"content": "<pad>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"unk_token": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

3
tokenizer.json Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
size 33384568

3
tokenizer.model Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
size 4689074

51345
tokenizer_config.json Normal file

File diff suppressed because it is too large Load Diff

922
trainer_state.json Normal file
View File

@@ -0,0 +1,922 @@
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 11066,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00018074807107917894,
"grad_norm": 58.25,
"learning_rate": 1.805054151624549e-07,
"loss": 4.652698516845703,
"step": 1,
"token_acc": 0.43327626145634
},
{
"epoch": 0.018074807107917895,
"grad_norm": 6.53125,
"learning_rate": 1.805054151624549e-05,
"loss": 3.920129641137942,
"step": 100,
"token_acc": 0.45332661759508064
},
{
"epoch": 0.03614961421583579,
"grad_norm": 6.0,
"learning_rate": 3.610108303249098e-05,
"loss": 2.8419125366210936,
"step": 200,
"token_acc": 0.5193163006097973
},
{
"epoch": 0.054224421323753685,
"grad_norm": 5.625,
"learning_rate": 5.415162454873647e-05,
"loss": 2.2325677490234375,
"step": 300,
"token_acc": 0.577260581442613
},
{
"epoch": 0.07229922843167158,
"grad_norm": 5.09375,
"learning_rate": 7.220216606498195e-05,
"loss": 1.9477809143066407,
"step": 400,
"token_acc": 0.6095200383891847
},
{
"epoch": 0.09037403553958948,
"grad_norm": 4.4375,
"learning_rate": 9.025270758122743e-05,
"loss": 1.7510385131835937,
"step": 500,
"token_acc": 0.6378192081318703
},
{
"epoch": 0.10844884264750737,
"grad_norm": 4.28125,
"learning_rate": 9.999527526045029e-05,
"loss": 1.6535232543945313,
"step": 600,
"token_acc": 0.6496733713777781
},
{
"epoch": 0.12652364975542527,
"grad_norm": 3.625,
"learning_rate": 9.99524110790929e-05,
"loss": 1.5711769104003905,
"step": 700,
"token_acc": 0.6620965629303417
},
{
"epoch": 0.14459845686334316,
"grad_norm": 3.4375,
"learning_rate": 9.986493474590536e-05,
"loss": 1.5024029541015624,
"step": 800,
"token_acc": 0.6721741041947722
},
{
"epoch": 0.16267326397126106,
"grad_norm": 3.0,
"learning_rate": 9.973292438539405e-05,
"loss": 1.4528140258789062,
"step": 900,
"token_acc": 0.6810912279574309
},
{
"epoch": 0.18074807107917895,
"grad_norm": 3.421875,
"learning_rate": 9.955649789509624e-05,
"loss": 1.4288172912597656,
"step": 1000,
"token_acc": 0.6845218263208889
},
{
"epoch": 0.19882287818709685,
"grad_norm": 3.4375,
"learning_rate": 9.933581284028659e-05,
"loss": 1.4166110229492188,
"step": 1100,
"token_acc": 0.68570654719404
},
{
"epoch": 0.21689768529501474,
"grad_norm": 2.90625,
"learning_rate": 9.907106631325671e-05,
"loss": 1.3844677734375,
"step": 1200,
"token_acc": 0.6901876362337299
},
{
"epoch": 0.23497249240293264,
"grad_norm": 2.375,
"learning_rate": 9.876249475729344e-05,
"loss": 1.3720639038085938,
"step": 1300,
"token_acc": 0.6921398802222583
},
{
"epoch": 0.25304729951085053,
"grad_norm": 2.6875,
"learning_rate": 9.841037375551294e-05,
"loss": 1.3502085876464844,
"step": 1400,
"token_acc": 0.6961120678024922
},
{
"epoch": 0.27112210661876845,
"grad_norm": 2.53125,
"learning_rate": 9.801501778473935e-05,
"loss": 1.3242225646972656,
"step": 1500,
"token_acc": 0.7008815156964483
},
{
"epoch": 0.2891969137266863,
"grad_norm": 2.671875,
"learning_rate": 9.757677993464771e-05,
"loss": 1.3310586547851562,
"step": 1600,
"token_acc": 0.6988938129948864
},
{
"epoch": 0.30727172083460425,
"grad_norm": 2.4375,
"learning_rate": 9.709605159242199e-05,
"loss": 1.3116970825195313,
"step": 1700,
"token_acc": 0.7015194406957871
},
{
"epoch": 0.3253465279425221,
"grad_norm": 2.5625,
"learning_rate": 9.657326209320998e-05,
"loss": 1.3130838012695312,
"step": 1800,
"token_acc": 0.7008979147977925
},
{
"epoch": 0.34342133505044004,
"grad_norm": 2.453125,
"learning_rate": 9.600887833668701e-05,
"loss": 1.2876347351074218,
"step": 1900,
"token_acc": 0.7059691653488992
},
{
"epoch": 0.3614961421583579,
"grad_norm": 2.28125,
"learning_rate": 9.540340437007106e-05,
"loss": 1.2805183410644532,
"step": 2000,
"token_acc": 0.7076493690679662
},
{
"epoch": 0.3795709492662758,
"grad_norm": 2.3125,
"learning_rate": 9.475738093796172e-05,
"loss": 1.2861131286621095,
"step": 2100,
"token_acc": 0.7055881842553867
},
{
"epoch": 0.3976457563741937,
"grad_norm": 2.375,
"learning_rate": 9.407138499940496e-05,
"loss": 1.268822479248047,
"step": 2200,
"token_acc": 0.7090846641855041
},
{
"epoch": 0.4157205634821116,
"grad_norm": 2.59375,
"learning_rate": 9.334602921261492e-05,
"loss": 1.2444308471679688,
"step": 2300,
"token_acc": 0.7131349902523053
},
{
"epoch": 0.4337953705900295,
"grad_norm": 3.015625,
"learning_rate": 9.258196138781327e-05,
"loss": 1.2656473541259765,
"step": 2400,
"token_acc": 0.7094099462846465
},
{
"epoch": 0.4518701776979474,
"grad_norm": 2.265625,
"learning_rate": 9.177986390867419e-05,
"loss": 1.2598892211914063,
"step": 2500,
"token_acc": 0.7109632032447007
},
{
"epoch": 0.4699449848058653,
"grad_norm": 2.5,
"learning_rate": 9.09404531228924e-05,
"loss": 1.2393927764892578,
"step": 2600,
"token_acc": 0.7133786975644326
},
{
"epoch": 0.4880197919137832,
"grad_norm": 2.3125,
"learning_rate": 9.0064478702418e-05,
"loss": 1.2412493896484376,
"step": 2700,
"token_acc": 0.7132032480227637
},
{
"epoch": 0.5060945990217011,
"grad_norm": 2.5625,
"learning_rate": 8.915272297392945e-05,
"loss": 1.2382020568847656,
"step": 2800,
"token_acc": 0.7131689546842609
},
{
"epoch": 0.5241694061296189,
"grad_norm": 2.640625,
"learning_rate": 8.820600022014338e-05,
"loss": 1.2349536895751954,
"step": 2900,
"token_acc": 0.7140737742682136
},
{
"epoch": 0.5422442132375369,
"grad_norm": 2.109375,
"learning_rate": 8.722515595258402e-05,
"loss": 1.2141342163085938,
"step": 3000,
"token_acc": 0.7181298317819385
},
{
"epoch": 0.5603190203454548,
"grad_norm": 2.078125,
"learning_rate": 8.621106615646292e-05,
"loss": 1.2058545684814452,
"step": 3100,
"token_acc": 0.7190379193709925
},
{
"epoch": 0.5783938274533726,
"grad_norm": 2.453125,
"learning_rate": 8.51646365083426e-05,
"loss": 1.2195273590087892,
"step": 3200,
"token_acc": 0.7170014250513891
},
{
"epoch": 0.5964686345612905,
"grad_norm": 2.3125,
"learning_rate": 8.408680156728299e-05,
"loss": 1.2225239562988282,
"step": 3300,
"token_acc": 0.71660874090857
},
{
"epoch": 0.6145434416692085,
"grad_norm": 2.359375,
"learning_rate": 8.297852394019336e-05,
"loss": 1.1935769653320312,
"step": 3400,
"token_acc": 0.721348535831292
},
{
"epoch": 0.6326182487771264,
"grad_norm": 2.328125,
"learning_rate": 8.184079342213466e-05,
"loss": 1.198054428100586,
"step": 3500,
"token_acc": 0.7195931038612298
},
{
"epoch": 0.6506930558850442,
"grad_norm": 2.15625,
"learning_rate": 8.067462611234052e-05,
"loss": 1.2039249420166016,
"step": 3600,
"token_acc": 0.7198145640636711
},
{
"epoch": 0.6687678629929621,
"grad_norm": 2.25,
"learning_rate": 7.948106350674593e-05,
"loss": 1.2044364166259767,
"step": 3700,
"token_acc": 0.7191590166885238
},
{
"epoch": 0.6868426701008801,
"grad_norm": 2.21875,
"learning_rate": 7.826117156783461e-05,
"loss": 1.1947254180908202,
"step": 3800,
"token_acc": 0.7211205989004562
},
{
"epoch": 0.7049174772087979,
"grad_norm": 2.171875,
"learning_rate": 7.701603977263513e-05,
"loss": 1.1763773345947266,
"step": 3900,
"token_acc": 0.7237809512705871
},
{
"epoch": 0.7229922843167158,
"grad_norm": 2.546875,
"learning_rate": 7.574678013971672e-05,
"loss": 1.180088424682617,
"step": 4000,
"token_acc": 0.7241476445274628
},
{
"epoch": 0.7410670914246337,
"grad_norm": 2.359375,
"learning_rate": 7.445452623605307e-05,
"loss": 1.1668415069580078,
"step": 4100,
"token_acc": 0.7264480800382547
},
{
"epoch": 0.7591418985325517,
"grad_norm": 2.015625,
"learning_rate": 7.314043216464158e-05,
"loss": 1.1807654571533204,
"step": 4200,
"token_acc": 0.7243762391137034
},
{
"epoch": 0.7772167056404695,
"grad_norm": 2.328125,
"learning_rate": 7.180567153378193e-05,
"loss": 1.1845186614990235,
"step": 4300,
"token_acc": 0.7214888736557151
},
{
"epoch": 0.7952915127483874,
"grad_norm": 2.1875,
"learning_rate": 7.045143640893474e-05,
"loss": 1.1583942413330077,
"step": 4400,
"token_acc": 0.7274813631693335
},
{
"epoch": 0.8133663198563053,
"grad_norm": 2.109375,
"learning_rate": 6.907893624809609e-05,
"loss": 1.145471420288086,
"step": 4500,
"token_acc": 0.7298424303929127
},
{
"epoch": 0.8314411269642232,
"grad_norm": 2.359375,
"learning_rate": 6.768939682163902e-05,
"loss": 1.1666727447509766,
"step": 4600,
"token_acc": 0.7264280244660613
},
{
"epoch": 0.8495159340721411,
"grad_norm": 2.09375,
"learning_rate": 6.628405911758647e-05,
"loss": 1.1622318267822265,
"step": 4700,
"token_acc": 0.7271438460195889
},
{
"epoch": 0.867590741180059,
"grad_norm": 2.15625,
"learning_rate": 6.486417823329354e-05,
"loss": 1.1364639282226563,
"step": 4800,
"token_acc": 0.7319144126512372
},
{
"epoch": 0.8856655482879768,
"grad_norm": 2.421875,
"learning_rate": 6.34310222545287e-05,
"loss": 1.1561846923828125,
"step": 4900,
"token_acc": 0.7282120514362783
},
{
"epoch": 0.9037403553958948,
"grad_norm": 2.109375,
"learning_rate": 6.198587112295526e-05,
"loss": 1.1464973449707032,
"step": 5000,
"token_acc": 0.7304544859576637
},
{
"epoch": 0.9218151625038127,
"grad_norm": 2.0,
"learning_rate": 6.053001549302422e-05,
"loss": 1.135927963256836,
"step": 5100,
"token_acc": 0.7309169370264882
},
{
"epoch": 0.9398899696117305,
"grad_norm": 2.359375,
"learning_rate": 5.906475557929985e-05,
"loss": 1.1362411499023437,
"step": 5200,
"token_acc": 0.7308731770004574
},
{
"epoch": 0.9579647767196484,
"grad_norm": 2.15625,
"learning_rate": 5.759139999524705e-05,
"loss": 1.132964859008789,
"step": 5300,
"token_acc": 0.7320876184986531
},
{
"epoch": 0.9760395838275664,
"grad_norm": 2.09375,
"learning_rate": 5.611126458451772e-05,
"loss": 1.1330313873291016,
"step": 5400,
"token_acc": 0.7321249508199824
},
{
"epoch": 0.9941143909354843,
"grad_norm": 2.1875,
"learning_rate": 5.462567124577992e-05,
"loss": 1.137665786743164,
"step": 5500,
"token_acc": 0.731388650772945
},
{
"epoch": 1.0121101207623049,
"grad_norm": 2.71875,
"learning_rate": 5.3135946752139385e-05,
"loss": 1.0870736694335938,
"step": 5600,
"token_acc": 0.7406857225004193
},
{
"epoch": 1.0301849278702229,
"grad_norm": 2.234375,
"learning_rate": 5.1643421566207615e-05,
"loss": 1.0574837493896485,
"step": 5700,
"token_acc": 0.7462299226419611
},
{
"epoch": 1.0482597349781408,
"grad_norm": 2.1875,
"learning_rate": 5.0149428651874985e-05,
"loss": 1.0600157165527344,
"step": 5800,
"token_acc": 0.7451135106751428
},
{
"epoch": 1.0663345420860586,
"grad_norm": 2.296875,
"learning_rate": 4.86553022838499e-05,
"loss": 1.0566656494140625,
"step": 5900,
"token_acc": 0.7469262925837835
},
{
"epoch": 1.0844093491939766,
"grad_norm": 2.1875,
"learning_rate": 4.716237685602735e-05,
"loss": 1.073977508544922,
"step": 6000,
"token_acc": 0.7416494081610407
},
{
"epoch": 1.1024841563018946,
"grad_norm": 2.359375,
"learning_rate": 4.567198568975096e-05,
"loss": 1.0501838684082032,
"step": 6100,
"token_acc": 0.7475906446092413
},
{
"epoch": 1.1205589634098123,
"grad_norm": 1.9453125,
"learning_rate": 4.418545984303294e-05,
"loss": 1.0559381866455078,
"step": 6200,
"token_acc": 0.7470775683736796
},
{
"epoch": 1.1386337705177303,
"grad_norm": 2.09375,
"learning_rate": 4.2704126921795424e-05,
"loss": 1.0539588928222656,
"step": 6300,
"token_acc": 0.7466571018782976
},
{
"epoch": 1.1567085776256483,
"grad_norm": 2.125,
"learning_rate": 4.1229309894194806e-05,
"loss": 1.0646955108642577,
"step": 6400,
"token_acc": 0.7450717568377625
},
{
"epoch": 1.174783384733566,
"grad_norm": 1.8828125,
"learning_rate": 3.976232590908812e-05,
"loss": 1.0539531707763672,
"step": 6500,
"token_acc": 0.7465753023540981
},
{
"epoch": 1.192858191841484,
"grad_norm": 2.25,
"learning_rate": 3.830448511969638e-05,
"loss": 1.0504056549072265,
"step": 6600,
"token_acc": 0.7470591527899016
},
{
"epoch": 1.2109329989494018,
"grad_norm": 1.9921875,
"learning_rate": 3.6857089513516035e-05,
"loss": 1.0537297821044922,
"step": 6700,
"token_acc": 0.746264189895728
},
{
"epoch": 1.2290078060573197,
"grad_norm": 2.078125,
"learning_rate": 3.542143174952282e-05,
"loss": 1.0569972229003906,
"step": 6800,
"token_acc": 0.7456408105039157
},
{
"epoch": 1.2470826131652377,
"grad_norm": 2.078125,
"learning_rate": 3.399879400370704e-05,
"loss": 1.0594657897949218,
"step": 6900,
"token_acc": 0.7464631930608675
},
{
"epoch": 1.2651574202731555,
"grad_norm": 2.296875,
"learning_rate": 3.259044682397107e-05,
"loss": 1.059138946533203,
"step": 7000,
"token_acc": 0.7466058265866723
},
{
"epoch": 1.2832322273810735,
"grad_norm": 2.140625,
"learning_rate": 3.119764799541187e-05,
"loss": 1.0547212982177734,
"step": 7100,
"token_acc": 0.7473527098438676
},
{
"epoch": 1.3013070344889912,
"grad_norm": 2.125,
"learning_rate": 2.9821641417001806e-05,
"loss": 1.0402613067626953,
"step": 7200,
"token_acc": 0.7493453306325137
},
{
"epoch": 1.3193818415969092,
"grad_norm": 2.140625,
"learning_rate": 2.846365599067111e-05,
"loss": 1.045955352783203,
"step": 7300,
"token_acc": 0.7483038206918617
},
{
"epoch": 1.3374566487048272,
"grad_norm": 2.203125,
"learning_rate": 2.7124904523784144e-05,
"loss": 1.0378961944580078,
"step": 7400,
"token_acc": 0.7502343443728431
},
{
"epoch": 1.355531455812745,
"grad_norm": 2.46875,
"learning_rate": 2.580658264598942e-05,
"loss": 1.0476718902587892,
"step": 7500,
"token_acc": 0.7483254533842428
},
{
"epoch": 1.373606262920663,
"grad_norm": 2.109375,
"learning_rate": 2.450986774141123e-05,
"loss": 1.0452989196777345,
"step": 7600,
"token_acc": 0.7482226148441731
},
{
"epoch": 1.3916810700285809,
"grad_norm": 2.140625,
"learning_rate": 2.3235917897135934e-05,
"loss": 1.0406829833984375,
"step": 7700,
"token_acc": 0.74961852670988
},
{
"epoch": 1.4097558771364986,
"grad_norm": 2.3125,
"learning_rate": 2.1985870868932456e-05,
"loss": 1.0259892272949218,
"step": 7800,
"token_acc": 0.7533507908450509
},
{
"epoch": 1.4278306842444166,
"grad_norm": 2.421875,
"learning_rate": 2.076084306513049e-05,
"loss": 1.0363540649414062,
"step": 7900,
"token_acc": 0.7503043904274775
},
{
"epoch": 1.4459054913523346,
"grad_norm": 1.984375,
"learning_rate": 1.9561928549563968e-05,
"loss": 1.0644924926757813,
"step": 8000,
"token_acc": 0.7456255842220881
},
{
"epoch": 1.4639802984602523,
"grad_norm": 2.125,
"learning_rate": 1.839019806447024e-05,
"loss": 1.0405005645751952,
"step": 8100,
"token_acc": 0.7495747171714061
},
{
"epoch": 1.4820551055681703,
"grad_norm": 2.09375,
"learning_rate": 1.724669807421762e-05,
"loss": 1.0440809631347656,
"step": 8200,
"token_acc": 0.7486369722470563
},
{
"epoch": 1.5001299126760883,
"grad_norm": 2.125,
"learning_rate": 1.6132449830715263e-05,
"loss": 1.0530775451660157,
"step": 8300,
"token_acc": 0.7477425947235785
},
{
"epoch": 1.518204719784006,
"grad_norm": 2.109375,
"learning_rate": 1.5048448461340258e-05,
"loss": 1.0460784912109375,
"step": 8400,
"token_acc": 0.7489244523851678
},
{
"epoch": 1.5362795268919238,
"grad_norm": 2.125,
"learning_rate": 1.3995662080196215e-05,
"loss": 1.0327759552001954,
"step": 8500,
"token_acc": 0.7517219276186928
},
{
"epoch": 1.5543543339998418,
"grad_norm": 1.8125,
"learning_rate": 1.2975030923497262e-05,
"loss": 1.037949981689453,
"step": 8600,
"token_acc": 0.7504533222058104
},
{
"epoch": 1.5724291411077598,
"grad_norm": 2.109375,
"learning_rate": 1.1987466509849655e-05,
"loss": 1.0523592376708983,
"step": 8700,
"token_acc": 0.7466913343954412
},
{
"epoch": 1.5905039482156775,
"grad_norm": 2.265625,
"learning_rate": 1.1033850826180781e-05,
"loss": 1.048785171508789,
"step": 8800,
"token_acc": 0.7472665235971139
},
{
"epoch": 1.6085787553235955,
"grad_norm": 2.0,
"learning_rate": 1.0115035540042784e-05,
"loss": 1.032520523071289,
"step": 8900,
"token_acc": 0.7507173368953239
},
{
"epoch": 1.6266535624315135,
"grad_norm": 2.21875,
"learning_rate": 9.231841238994194e-06,
"loss": 1.043864974975586,
"step": 9000,
"token_acc": 0.7493066372338768
},
{
"epoch": 1.6447283695394312,
"grad_norm": 1.90625,
"learning_rate": 8.385056697738796e-06,
"loss": 1.0542935943603515,
"step": 9100,
"token_acc": 0.7465372369884397
},
{
"epoch": 1.6628031766473492,
"grad_norm": 1.984375,
"learning_rate": 7.575438173676513e-06,
"loss": 1.031275177001953,
"step": 9200,
"token_acc": 0.7517255178380788
},
{
"epoch": 1.6808779837552672,
"grad_norm": 2.125,
"learning_rate": 6.803708731495117e-06,
"loss": 1.0453128814697266,
"step": 9300,
"token_acc": 0.7487111883579803
},
{
"epoch": 1.698952790863185,
"grad_norm": 1.8984375,
"learning_rate": 6.070557597406163e-06,
"loss": 1.0431288146972657,
"step": 9400,
"token_acc": 0.7488178364241926
},
{
"epoch": 1.717027597971103,
"grad_norm": 1.96875,
"learning_rate": 5.376639543601858e-06,
"loss": 1.0395802307128905,
"step": 9500,
"token_acc": 0.7503542725542914
},
{
"epoch": 1.735102405079021,
"grad_norm": 2.078125,
"learning_rate": 4.722574303482557e-06,
"loss": 1.0538075256347657,
"step": 9600,
"token_acc": 0.7472696432430282
},
{
"epoch": 1.7531772121869387,
"grad_norm": 2.234375,
"learning_rate": 4.1089460181771675e-06,
"loss": 1.046026382446289,
"step": 9700,
"token_acc": 0.7478882170852356
},
{
"epoch": 1.7712520192948564,
"grad_norm": 2.15625,
"learning_rate": 3.5363027148507423e-06,
"loss": 1.0465138244628907,
"step": 9800,
"token_acc": 0.7479565299418576
},
{
"epoch": 1.7893268264027746,
"grad_norm": 2.28125,
"learning_rate": 3.0051558172652316e-06,
"loss": 1.0322959899902344,
"step": 9900,
"token_acc": 0.752127805838985
},
{
"epoch": 1.8074016335106924,
"grad_norm": 2.21875,
"learning_rate": 2.5159796890304564e-06,
"loss": 1.0446186065673828,
"step": 10000,
"token_acc": 0.7488376764944241
},
{
"epoch": 1.8254764406186101,
"grad_norm": 2.0625,
"learning_rate": 2.069211209953287e-06,
"loss": 1.0309945678710937,
"step": 10100,
"token_acc": 0.7524413140197763
},
{
"epoch": 1.8435512477265281,
"grad_norm": 1.8203125,
"learning_rate": 1.6652493858632823e-06,
"loss": 1.0363735198974608,
"step": 10200,
"token_acc": 0.7503588935333034
},
{
"epoch": 1.861626054834446,
"grad_norm": 2.296875,
"learning_rate": 1.3044549922633876e-06,
"loss": 1.0336082458496094,
"step": 10300,
"token_acc": 0.7506362801039103
},
{
"epoch": 1.8797008619423639,
"grad_norm": 2.046875,
"learning_rate": 9.871502521237975e-07,
"loss": 1.0328756713867187,
"step": 10400,
"token_acc": 0.7518247545659716
},
{
"epoch": 1.8977756690502818,
"grad_norm": 1.8671875,
"learning_rate": 7.136185481068925e-07,
"loss": 1.0489426422119141,
"step": 10500,
"token_acc": 0.747691160714326
},
{
"epoch": 1.9158504761581998,
"grad_norm": 1.765625,
"learning_rate": 4.841041694801208e-07,
"loss": 1.0270442962646484,
"step": 10600,
"token_acc": 0.7519379627407866
},
{
"epoch": 1.9339252832661176,
"grad_norm": 2.0,
"learning_rate": 2.988120939429684e-07,
"loss": 1.0303496551513671,
"step": 10700,
"token_acc": 0.7513509325616106
},
{
"epoch": 1.9520000903740355,
"grad_norm": 1.890625,
"learning_rate": 1.5790780456277355e-07,
"loss": 1.0394702911376954,
"step": 10800,
"token_acc": 0.7495070401423912
},
{
"epoch": 1.9700748974819535,
"grad_norm": 2.390625,
"learning_rate": 6.15171419829752e-08,
"loss": 1.0353932189941406,
"step": 10900,
"token_acc": 0.7506620155660084
},
{
"epoch": 1.9881497045898713,
"grad_norm": 2.15625,
"learning_rate": 9.726192035691694e-09,
"loss": 1.0364900970458983,
"step": 11000,
"token_acc": 0.7511165613553197
}
],
"logging_steps": 100,
"max_steps": 11066,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.597718898108247e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}

3
training_args.bin Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a53afc965adde4abf79521e988467b9e3ccedf21f656c2f533f82df1965c7abf
size 6993