From ad6be70c25cd301e4b0f25939886f1cf3c73d91d Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Sat, 18 Apr 2026 08:32:01 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: vandijklab/C2S-Pythia-410m-cell-type-conditioned-cell-generation Source: Original Platform --- .gitattributes | 53 ++ README.md | 40 ++ config.json | 30 ++ configuration.json | 1 + generation_config.json | 7 + model.safetensors | 3 + rng_state.pth | 3 + scheduler.pt | 3 + special_tokens_map.json | 24 + tokenizer.json | 3 + tokenizer_config.json | 214 ++++++++ trainer_state.json | 1083 +++++++++++++++++++++++++++++++++++++++ training_args.bin | 3 + 13 files changed, 1467 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 config.json create mode 100644 configuration.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 rng_state.pth create mode 100644 scheduler.pt create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..ec6c39d --- /dev/null +++ b/.gitattributes @@ -0,0 +1,53 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text + + +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text + + +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +training_args.bin filter=lfs diff=lfs merge=lfs -text +rng_state.pth filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +model.safetensors filter=lfs diff=lfs merge=lfs -text +scheduler.pt filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..8cf2ef9 --- /dev/null +++ b/README.md @@ -0,0 +1,40 @@ +--- +license: cc-by-4.0 +language: +- en +base_model: EleutherAI/pythia-410m +library_name: transformers +tags: +- biology +- scRNAseq +--- + +# Overview +This is the C2S-Pythia-410m-cell-type-conditioned-cell-generation model, built on the Pythia-410m architecture developed +by EleutherAI, fine-tuned using Cell2Sentence (C2S) on a comprehensive collection of single-cell RNA sequencing +(scRNA-seq) datasets from CellxGene and the Human Cell Atlas. Cell2Sentence is a pioneering technique that adapts +large language models (LLMs) to single-cell biology by converting scRNA-seq data into "cell sentences" — ordered +sequences of gene names based on expression levels. This model is specifically trained for cell type-conditioned +single-cell generation, enabling the generation of realistic single-cell profiles conditioned on specified cell +types. + +# Training Data +This model was trained on over 57 million human and mouse cells gathered from over 800 single-cell RNA sequencing +datasets from CellxGene and the Human Cell Atlas. This dataset covers a broad range of cell types and conditions +from multiple tissues in both human and mouse. + +This model was trained with the top 200 genes per cell sentence. + +# Tasks +This model is designed for: + +- Cell type-conditioned single-cell generation: Generating single-cell profiles conditioned on specific cell types, allowing for the creation of synthetic cells that reflect the gene expression patterns of targeted cell types. + + +# Cell2Sentence Links +- GitHub: https://github.com/vandijklab/cell2sentence (Note: Codebase has Apache 2.0 license, weights shared on HuggingFace are CC-by-4.0) +- Paper: https://www.biorxiv.org/content/10.1101/2023.09.11.557287v3 + +# Pythia Links +- Paper: https://arxiv.org/pdf/2304.01373 +- Hugging Face: https://huggingface.co/EleutherAI/pythia-410m \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..0cc4951 --- /dev/null +++ b/config.json @@ -0,0 +1,30 @@ +{ + "_name_or_path": "EleutherAI/pythia-410m", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "eos_token_id": 0, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 4096, + "model_type": "gpt_neox", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "rope_scaling": null, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.41.0", + "use_cache": false, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..ea61db8 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 0, + "eos_token_id": 0, + "transformers_version": "4.41.0", + "use_cache": false +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..7ec4628 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f577cb1e096bd60402864246a00ab4c134732c86c1fea778f75b25e22005de34 +size 1621370224 diff --git a/rng_state.pth b/rng_state.pth new file mode 100644 index 0000000..ec2db64 --- /dev/null +++ b/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2455962b40b4d11fc51a4a05cd856f8de69d57ee9739b17e08471d4403b6dd9 +size 14244 diff --git a/scheduler.pt b/scheduler.pt new file mode 100644 index 0000000..a39451e --- /dev/null +++ b/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33817259d0fe142b1457dacebf6aeed9e5ee493ae600ebf9ed24a8d94a22bf61 +size 1064 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..330140f --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|endoftext|>", + "unk_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..614d056 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:870f4e2baa6b683221fa52004d5d6f40ab8c9d31961617304b78c910c2c3caf2 +size 2114042 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..67bcd0e --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,214 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<|padding|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "50254": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50255": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50256": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50257": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50258": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50259": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50260": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50261": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50262": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50263": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50264": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50265": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50266": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50267": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50268": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50269": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50270": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50271": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50272": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50273": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50274": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50275": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50276": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|endoftext|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|endoftext|>", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|endoftext|>", + "tokenizer_class": "GPTNeoXTokenizer", + "unk_token": "<|endoftext|>" +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..c79b677 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,1083 @@ +{ + "best_metric": 1.805204153060913, + "best_model_checkpoint": "/home/sr2464/scratch/C2S_Files/C2S_training_runs/multicell_v2_pretraining_runs/finetuning-EleutherAI/pythia-410m-multicell_v2_pretraining-2024-07-28_14-10-44/checkpoint-7000", + "epoch": 0.024296135179068302, + "eval_steps": 100, + "global_step": 7000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00034708764541526143, + "grad_norm": 12.638826370239258, + "learning_rate": 3.470776065528252e-07, + "loss": 4.1006, + "step": 100 + }, + { + "epoch": 0.00034708764541526143, + "eval_loss": 3.750605821609497, + "eval_runtime": 28.5808, + "eval_samples_per_second": 16.69, + "eval_steps_per_second": 0.35, + "step": 100 + }, + { + "epoch": 0.0006941752908305229, + "grad_norm": 7.417098522186279, + "learning_rate": 6.941552131056504e-07, + "loss": 3.4941, + "step": 200 + }, + { + "epoch": 0.0006941752908305229, + "eval_loss": 3.152513265609741, + "eval_runtime": 27.3056, + "eval_samples_per_second": 17.469, + "eval_steps_per_second": 0.366, + "step": 200 + }, + { + "epoch": 0.0010412629362457843, + "grad_norm": 5.065331935882568, + "learning_rate": 1.0412328196584756e-06, + "loss": 3.0197, + "step": 300 + }, + { + "epoch": 0.0010412629362457843, + "eval_loss": 2.8438754081726074, + "eval_runtime": 27.3207, + "eval_samples_per_second": 17.459, + "eval_steps_per_second": 0.366, + "step": 300 + }, + { + "epoch": 0.0013883505816610457, + "grad_norm": 5.113323211669922, + "learning_rate": 1.3883104262113009e-06, + "loss": 2.7915, + "step": 400 + }, + { + "epoch": 0.0013883505816610457, + "eval_loss": 2.6905336380004883, + "eval_runtime": 27.6367, + "eval_samples_per_second": 17.26, + "eval_steps_per_second": 0.362, + "step": 400 + }, + { + "epoch": 0.0017354382270763071, + "grad_norm": 6.473913192749023, + "learning_rate": 1.7353880327641261e-06, + "loss": 2.6697, + "step": 500 + }, + { + "epoch": 0.0017354382270763071, + "eval_loss": 2.609987735748291, + "eval_runtime": 27.5233, + "eval_samples_per_second": 17.331, + "eval_steps_per_second": 0.363, + "step": 500 + }, + { + "epoch": 0.0020825258724915686, + "grad_norm": 6.615407466888428, + "learning_rate": 2.082465639316951e-06, + "loss": 2.5959, + "step": 600 + }, + { + "epoch": 0.0020825258724915686, + "eval_loss": 2.5381109714508057, + "eval_runtime": 27.411, + "eval_samples_per_second": 17.402, + "eval_steps_per_second": 0.365, + "step": 600 + }, + { + "epoch": 0.00242961351790683, + "grad_norm": 8.234296798706055, + "learning_rate": 2.429543245869777e-06, + "loss": 2.5239, + "step": 700 + }, + { + "epoch": 0.00242961351790683, + "eval_loss": 2.4786229133605957, + "eval_runtime": 27.9876, + "eval_samples_per_second": 17.043, + "eval_steps_per_second": 0.357, + "step": 700 + }, + { + "epoch": 0.0027767011633220914, + "grad_norm": 3.987603187561035, + "learning_rate": 2.7766208524226017e-06, + "loss": 2.4544, + "step": 800 + }, + { + "epoch": 0.0027767011633220914, + "eval_loss": 2.4083495140075684, + "eval_runtime": 27.5819, + "eval_samples_per_second": 17.294, + "eval_steps_per_second": 0.363, + "step": 800 + }, + { + "epoch": 0.003123788808737353, + "grad_norm": 3.854851484298706, + "learning_rate": 3.123698458975427e-06, + "loss": 2.3861, + "step": 900 + }, + { + "epoch": 0.003123788808737353, + "eval_loss": 2.336987257003784, + "eval_runtime": 27.6168, + "eval_samples_per_second": 17.272, + "eval_steps_per_second": 0.362, + "step": 900 + }, + { + "epoch": 0.0034708764541526143, + "grad_norm": 3.7287888526916504, + "learning_rate": 3.4707760655282523e-06, + "loss": 2.3199, + "step": 1000 + }, + { + "epoch": 0.0034708764541526143, + "eval_loss": 2.2719526290893555, + "eval_runtime": 26.9345, + "eval_samples_per_second": 17.71, + "eval_steps_per_second": 0.371, + "step": 1000 + }, + { + "epoch": 0.0038179640995678757, + "grad_norm": 3.5708882808685303, + "learning_rate": 3.817853672081077e-06, + "loss": 2.2467, + "step": 1100 + }, + { + "epoch": 0.0038179640995678757, + "eval_loss": 2.206721067428589, + "eval_runtime": 27.0387, + "eval_samples_per_second": 17.641, + "eval_steps_per_second": 0.37, + "step": 1100 + }, + { + "epoch": 0.004165051744983137, + "grad_norm": 3.284808397293091, + "learning_rate": 4.164931278633902e-06, + "loss": 2.2017, + "step": 1200 + }, + { + "epoch": 0.004165051744983137, + "eval_loss": 2.1687662601470947, + "eval_runtime": 26.9942, + "eval_samples_per_second": 17.67, + "eval_steps_per_second": 0.37, + "step": 1200 + }, + { + "epoch": 0.004512139390398399, + "grad_norm": 3.306734323501587, + "learning_rate": 4.5120088851867285e-06, + "loss": 2.1612, + "step": 1300 + }, + { + "epoch": 0.004512139390398399, + "eval_loss": 2.129286766052246, + "eval_runtime": 27.1243, + "eval_samples_per_second": 17.586, + "eval_steps_per_second": 0.369, + "step": 1300 + }, + { + "epoch": 0.00485922703581366, + "grad_norm": 4.48453950881958, + "learning_rate": 4.859086491739554e-06, + "loss": 2.1302, + "step": 1400 + }, + { + "epoch": 0.00485922703581366, + "eval_loss": 2.1066970825195312, + "eval_runtime": 26.7667, + "eval_samples_per_second": 17.821, + "eval_steps_per_second": 0.374, + "step": 1400 + }, + { + "epoch": 0.0052063146812289214, + "grad_norm": 3.936067819595337, + "learning_rate": 5.206164098292378e-06, + "loss": 2.1019, + "step": 1500 + }, + { + "epoch": 0.0052063146812289214, + "eval_loss": 2.079268217086792, + "eval_runtime": 26.9038, + "eval_samples_per_second": 17.73, + "eval_steps_per_second": 0.372, + "step": 1500 + }, + { + "epoch": 0.005553402326644183, + "grad_norm": 3.049617290496826, + "learning_rate": 5.5532417048452035e-06, + "loss": 2.0839, + "step": 1600 + }, + { + "epoch": 0.005553402326644183, + "eval_loss": 2.063786268234253, + "eval_runtime": 26.8676, + "eval_samples_per_second": 17.754, + "eval_steps_per_second": 0.372, + "step": 1600 + }, + { + "epoch": 0.005900489972059444, + "grad_norm": 2.732224464416504, + "learning_rate": 5.900319311398029e-06, + "loss": 2.0707, + "step": 1700 + }, + { + "epoch": 0.005900489972059444, + "eval_loss": 2.046088695526123, + "eval_runtime": 26.9915, + "eval_samples_per_second": 17.672, + "eval_steps_per_second": 0.37, + "step": 1700 + }, + { + "epoch": 0.006247577617474706, + "grad_norm": 2.1865687370300293, + "learning_rate": 6.247396917950854e-06, + "loss": 2.0527, + "step": 1800 + }, + { + "epoch": 0.006247577617474706, + "eval_loss": 2.031470537185669, + "eval_runtime": 26.9114, + "eval_samples_per_second": 17.725, + "eval_steps_per_second": 0.372, + "step": 1800 + }, + { + "epoch": 0.006594665262889967, + "grad_norm": 2.854724407196045, + "learning_rate": 6.59447452450368e-06, + "loss": 2.0407, + "step": 1900 + }, + { + "epoch": 0.006594665262889967, + "eval_loss": 2.0221400260925293, + "eval_runtime": 26.7594, + "eval_samples_per_second": 17.825, + "eval_steps_per_second": 0.374, + "step": 1900 + }, + { + "epoch": 0.006941752908305229, + "grad_norm": 2.716592788696289, + "learning_rate": 6.9415521310565046e-06, + "loss": 2.0369, + "step": 2000 + }, + { + "epoch": 0.006941752908305229, + "eval_loss": 2.0095324516296387, + "eval_runtime": 26.7666, + "eval_samples_per_second": 17.821, + "eval_steps_per_second": 0.374, + "step": 2000 + }, + { + "epoch": 0.00728884055372049, + "grad_norm": 3.099235773086548, + "learning_rate": 7.28862973760933e-06, + "loss": 2.0172, + "step": 2100 + }, + { + "epoch": 0.00728884055372049, + "eval_loss": 1.9926340579986572, + "eval_runtime": 26.7511, + "eval_samples_per_second": 17.831, + "eval_steps_per_second": 0.374, + "step": 2100 + }, + { + "epoch": 0.0076359281991357515, + "grad_norm": 2.7901623249053955, + "learning_rate": 7.635707344162154e-06, + "loss": 2.0011, + "step": 2200 + }, + { + "epoch": 0.0076359281991357515, + "eval_loss": 1.9787664413452148, + "eval_runtime": 26.8686, + "eval_samples_per_second": 17.753, + "eval_steps_per_second": 0.372, + "step": 2200 + }, + { + "epoch": 0.007983015844551014, + "grad_norm": 1.9205849170684814, + "learning_rate": 7.982784950714981e-06, + "loss": 1.9848, + "step": 2300 + }, + { + "epoch": 0.007983015844551014, + "eval_loss": 1.9650758504867554, + "eval_runtime": 26.6567, + "eval_samples_per_second": 17.894, + "eval_steps_per_second": 0.375, + "step": 2300 + }, + { + "epoch": 0.008330103489966274, + "grad_norm": 2.2833688259124756, + "learning_rate": 8.329862557267805e-06, + "loss": 1.9804, + "step": 2400 + }, + { + "epoch": 0.008330103489966274, + "eval_loss": 1.95713472366333, + "eval_runtime": 26.6415, + "eval_samples_per_second": 17.904, + "eval_steps_per_second": 0.375, + "step": 2400 + }, + { + "epoch": 0.008677191135381537, + "grad_norm": 2.6494622230529785, + "learning_rate": 8.67694016382063e-06, + "loss": 1.9622, + "step": 2500 + }, + { + "epoch": 0.008677191135381537, + "eval_loss": 1.949569821357727, + "eval_runtime": 26.7134, + "eval_samples_per_second": 17.856, + "eval_steps_per_second": 0.374, + "step": 2500 + }, + { + "epoch": 0.009024278780796797, + "grad_norm": 1.9379554986953735, + "learning_rate": 9.024017770373457e-06, + "loss": 1.9624, + "step": 2600 + }, + { + "epoch": 0.009024278780796797, + "eval_loss": 1.9367306232452393, + "eval_runtime": 26.6328, + "eval_samples_per_second": 17.91, + "eval_steps_per_second": 0.375, + "step": 2600 + }, + { + "epoch": 0.00937136642621206, + "grad_norm": 1.797877311706543, + "learning_rate": 9.37109537692628e-06, + "loss": 1.9466, + "step": 2700 + }, + { + "epoch": 0.00937136642621206, + "eval_loss": 1.9342317581176758, + "eval_runtime": 26.6027, + "eval_samples_per_second": 17.931, + "eval_steps_per_second": 0.376, + "step": 2700 + }, + { + "epoch": 0.00971845407162732, + "grad_norm": 2.123497724533081, + "learning_rate": 9.718172983479108e-06, + "loss": 1.9421, + "step": 2800 + }, + { + "epoch": 0.00971845407162732, + "eval_loss": 1.9269955158233643, + "eval_runtime": 26.3282, + "eval_samples_per_second": 18.117, + "eval_steps_per_second": 0.38, + "step": 2800 + }, + { + "epoch": 0.010065541717042582, + "grad_norm": 2.386960029602051, + "learning_rate": 1.0065250590031931e-05, + "loss": 1.9395, + "step": 2900 + }, + { + "epoch": 0.010065541717042582, + "eval_loss": 1.9186092615127563, + "eval_runtime": 26.5488, + "eval_samples_per_second": 17.967, + "eval_steps_per_second": 0.377, + "step": 2900 + }, + { + "epoch": 0.010412629362457843, + "grad_norm": 1.994126558303833, + "learning_rate": 1.0412328196584756e-05, + "loss": 1.9242, + "step": 3000 + }, + { + "epoch": 0.010412629362457843, + "eval_loss": 1.9123972654342651, + "eval_runtime": 26.4618, + "eval_samples_per_second": 18.026, + "eval_steps_per_second": 0.378, + "step": 3000 + }, + { + "epoch": 0.010759717007873105, + "grad_norm": 1.7175105810165405, + "learning_rate": 1.0759405803137582e-05, + "loss": 1.9232, + "step": 3100 + }, + { + "epoch": 0.010759717007873105, + "eval_loss": 1.903687834739685, + "eval_runtime": 26.5449, + "eval_samples_per_second": 17.97, + "eval_steps_per_second": 0.377, + "step": 3100 + }, + { + "epoch": 0.011106804653288366, + "grad_norm": 1.8057008981704712, + "learning_rate": 1.1106483409690407e-05, + "loss": 1.923, + "step": 3200 + }, + { + "epoch": 0.011106804653288366, + "eval_loss": 1.9004727602005005, + "eval_runtime": 26.3249, + "eval_samples_per_second": 18.12, + "eval_steps_per_second": 0.38, + "step": 3200 + }, + { + "epoch": 0.011453892298703628, + "grad_norm": 1.6960588693618774, + "learning_rate": 1.1453561016243232e-05, + "loss": 1.9102, + "step": 3300 + }, + { + "epoch": 0.011453892298703628, + "eval_loss": 1.8942128419876099, + "eval_runtime": 26.351, + "eval_samples_per_second": 18.102, + "eval_steps_per_second": 0.379, + "step": 3300 + }, + { + "epoch": 0.011800979944118889, + "grad_norm": 1.8168739080429077, + "learning_rate": 1.1800638622796058e-05, + "loss": 1.9074, + "step": 3400 + }, + { + "epoch": 0.011800979944118889, + "eval_loss": 1.889912486076355, + "eval_runtime": 26.273, + "eval_samples_per_second": 18.155, + "eval_steps_per_second": 0.381, + "step": 3400 + }, + { + "epoch": 0.012148067589534151, + "grad_norm": 1.8745700120925903, + "learning_rate": 1.2147716229348883e-05, + "loss": 1.9053, + "step": 3500 + }, + { + "epoch": 0.012148067589534151, + "eval_loss": 1.8899447917938232, + "eval_runtime": 26.3575, + "eval_samples_per_second": 18.097, + "eval_steps_per_second": 0.379, + "step": 3500 + }, + { + "epoch": 0.012495155234949411, + "grad_norm": 1.77675199508667, + "learning_rate": 1.2494793835901708e-05, + "loss": 1.8983, + "step": 3600 + }, + { + "epoch": 0.012495155234949411, + "eval_loss": 1.8839936256408691, + "eval_runtime": 26.4506, + "eval_samples_per_second": 18.034, + "eval_steps_per_second": 0.378, + "step": 3600 + }, + { + "epoch": 0.012842242880364674, + "grad_norm": 1.473862648010254, + "learning_rate": 1.2841871442454533e-05, + "loss": 1.8998, + "step": 3700 + }, + { + "epoch": 0.012842242880364674, + "eval_loss": 1.8791130781173706, + "eval_runtime": 26.7984, + "eval_samples_per_second": 17.8, + "eval_steps_per_second": 0.373, + "step": 3700 + }, + { + "epoch": 0.013189330525779934, + "grad_norm": 1.486438512802124, + "learning_rate": 1.318894904900736e-05, + "loss": 1.8951, + "step": 3800 + }, + { + "epoch": 0.013189330525779934, + "eval_loss": 1.8784898519515991, + "eval_runtime": 26.3412, + "eval_samples_per_second": 18.109, + "eval_steps_per_second": 0.38, + "step": 3800 + }, + { + "epoch": 0.013536418171195197, + "grad_norm": 1.2373360395431519, + "learning_rate": 1.3536026655560182e-05, + "loss": 1.887, + "step": 3900 + }, + { + "epoch": 0.013536418171195197, + "eval_loss": 1.870314121246338, + "eval_runtime": 26.436, + "eval_samples_per_second": 18.044, + "eval_steps_per_second": 0.378, + "step": 3900 + }, + { + "epoch": 0.013883505816610457, + "grad_norm": 1.5347727537155151, + "learning_rate": 1.3883104262113009e-05, + "loss": 1.8951, + "step": 4000 + }, + { + "epoch": 0.013883505816610457, + "eval_loss": 1.8662590980529785, + "eval_runtime": 26.4553, + "eval_samples_per_second": 18.03, + "eval_steps_per_second": 0.378, + "step": 4000 + }, + { + "epoch": 0.01423059346202572, + "grad_norm": 1.771752953529358, + "learning_rate": 1.4230181868665834e-05, + "loss": 1.8854, + "step": 4100 + }, + { + "epoch": 0.01423059346202572, + "eval_loss": 1.8646233081817627, + "eval_runtime": 26.398, + "eval_samples_per_second": 18.07, + "eval_steps_per_second": 0.379, + "step": 4100 + }, + { + "epoch": 0.01457768110744098, + "grad_norm": 2.278961420059204, + "learning_rate": 1.457725947521866e-05, + "loss": 1.8829, + "step": 4200 + }, + { + "epoch": 0.01457768110744098, + "eval_loss": 1.8655200004577637, + "eval_runtime": 26.4296, + "eval_samples_per_second": 18.048, + "eval_steps_per_second": 0.378, + "step": 4200 + }, + { + "epoch": 0.014924768752856242, + "grad_norm": 1.965826392173767, + "learning_rate": 1.4924337081771487e-05, + "loss": 1.8768, + "step": 4300 + }, + { + "epoch": 0.014924768752856242, + "eval_loss": 1.861385464668274, + "eval_runtime": 26.4002, + "eval_samples_per_second": 18.068, + "eval_steps_per_second": 0.379, + "step": 4300 + }, + { + "epoch": 0.015271856398271503, + "grad_norm": 1.2046449184417725, + "learning_rate": 1.527141468832431e-05, + "loss": 1.8789, + "step": 4400 + }, + { + "epoch": 0.015271856398271503, + "eval_loss": 1.8564562797546387, + "eval_runtime": 26.5199, + "eval_samples_per_second": 17.986, + "eval_steps_per_second": 0.377, + "step": 4400 + }, + { + "epoch": 0.015618944043686765, + "grad_norm": 1.2664066553115845, + "learning_rate": 1.5618492294877134e-05, + "loss": 1.8676, + "step": 4500 + }, + { + "epoch": 0.015618944043686765, + "eval_loss": 1.8561848402023315, + "eval_runtime": 26.3757, + "eval_samples_per_second": 18.085, + "eval_steps_per_second": 0.379, + "step": 4500 + }, + { + "epoch": 0.015966031689102027, + "grad_norm": 1.242021918296814, + "learning_rate": 1.5965569901429962e-05, + "loss": 1.8747, + "step": 4600 + }, + { + "epoch": 0.015966031689102027, + "eval_loss": 1.8486684560775757, + "eval_runtime": 26.3838, + "eval_samples_per_second": 18.079, + "eval_steps_per_second": 0.379, + "step": 4600 + }, + { + "epoch": 0.016313119334517286, + "grad_norm": 1.7827301025390625, + "learning_rate": 1.6312647507982788e-05, + "loss": 1.8675, + "step": 4700 + }, + { + "epoch": 0.016313119334517286, + "eval_loss": 1.848850131034851, + "eval_runtime": 26.3928, + "eval_samples_per_second": 18.073, + "eval_steps_per_second": 0.379, + "step": 4700 + }, + { + "epoch": 0.01666020697993255, + "grad_norm": 1.4154130220413208, + "learning_rate": 1.665972511453561e-05, + "loss": 1.8646, + "step": 4800 + }, + { + "epoch": 0.01666020697993255, + "eval_loss": 1.8437469005584717, + "eval_runtime": 26.3717, + "eval_samples_per_second": 18.088, + "eval_steps_per_second": 0.379, + "step": 4800 + }, + { + "epoch": 0.01700729462534781, + "grad_norm": 1.556569218635559, + "learning_rate": 1.7006802721088435e-05, + "loss": 1.8633, + "step": 4900 + }, + { + "epoch": 0.01700729462534781, + "eval_loss": 1.8424543142318726, + "eval_runtime": 26.5267, + "eval_samples_per_second": 17.982, + "eval_steps_per_second": 0.377, + "step": 4900 + }, + { + "epoch": 0.017354382270763073, + "grad_norm": 1.2393227815628052, + "learning_rate": 1.735388032764126e-05, + "loss": 1.8581, + "step": 5000 + }, + { + "epoch": 0.017354382270763073, + "eval_loss": 1.8405553102493286, + "eval_runtime": 26.3219, + "eval_samples_per_second": 18.122, + "eval_steps_per_second": 0.38, + "step": 5000 + }, + { + "epoch": 0.017701469916178332, + "grad_norm": 1.141632318496704, + "learning_rate": 1.770095793419409e-05, + "loss": 1.8589, + "step": 5100 + }, + { + "epoch": 0.017701469916178332, + "eval_loss": 1.8358362913131714, + "eval_runtime": 26.3268, + "eval_samples_per_second": 18.118, + "eval_steps_per_second": 0.38, + "step": 5100 + }, + { + "epoch": 0.018048557561593594, + "grad_norm": 1.1542439460754395, + "learning_rate": 1.8048035540746914e-05, + "loss": 1.8583, + "step": 5200 + }, + { + "epoch": 0.018048557561593594, + "eval_loss": 1.8365525007247925, + "eval_runtime": 26.4883, + "eval_samples_per_second": 18.008, + "eval_steps_per_second": 0.378, + "step": 5200 + }, + { + "epoch": 0.018395645207008857, + "grad_norm": 1.0082952976226807, + "learning_rate": 1.8395113147299736e-05, + "loss": 1.8523, + "step": 5300 + }, + { + "epoch": 0.018395645207008857, + "eval_loss": 1.83348548412323, + "eval_runtime": 26.7213, + "eval_samples_per_second": 17.851, + "eval_steps_per_second": 0.374, + "step": 5300 + }, + { + "epoch": 0.01874273285242412, + "grad_norm": 1.3992128372192383, + "learning_rate": 1.874219075385256e-05, + "loss": 1.8557, + "step": 5400 + }, + { + "epoch": 0.01874273285242412, + "eval_loss": 1.830024003982544, + "eval_runtime": 26.3985, + "eval_samples_per_second": 18.069, + "eval_steps_per_second": 0.379, + "step": 5400 + }, + { + "epoch": 0.019089820497839378, + "grad_norm": 1.0521454811096191, + "learning_rate": 1.9089268360405386e-05, + "loss": 1.8541, + "step": 5500 + }, + { + "epoch": 0.019089820497839378, + "eval_loss": 1.8300797939300537, + "eval_runtime": 26.4761, + "eval_samples_per_second": 18.016, + "eval_steps_per_second": 0.378, + "step": 5500 + }, + { + "epoch": 0.01943690814325464, + "grad_norm": 1.2668583393096924, + "learning_rate": 1.9436345966958215e-05, + "loss": 1.8453, + "step": 5600 + }, + { + "epoch": 0.01943690814325464, + "eval_loss": 1.8309643268585205, + "eval_runtime": 26.3911, + "eval_samples_per_second": 18.074, + "eval_steps_per_second": 0.379, + "step": 5600 + }, + { + "epoch": 0.019783995788669902, + "grad_norm": 1.09402334690094, + "learning_rate": 1.9783423573511037e-05, + "loss": 1.8536, + "step": 5700 + }, + { + "epoch": 0.019783995788669902, + "eval_loss": 1.8289095163345337, + "eval_runtime": 26.4192, + "eval_samples_per_second": 18.055, + "eval_steps_per_second": 0.379, + "step": 5700 + }, + { + "epoch": 0.020131083434085165, + "grad_norm": 0.960943341255188, + "learning_rate": 2.0130501180063862e-05, + "loss": 1.8509, + "step": 5800 + }, + { + "epoch": 0.020131083434085165, + "eval_loss": 1.824504017829895, + "eval_runtime": 26.471, + "eval_samples_per_second": 18.02, + "eval_steps_per_second": 0.378, + "step": 5800 + }, + { + "epoch": 0.020478171079500423, + "grad_norm": 1.0958678722381592, + "learning_rate": 2.0477578786616688e-05, + "loss": 1.8452, + "step": 5900 + }, + { + "epoch": 0.020478171079500423, + "eval_loss": 1.8239096403121948, + "eval_runtime": 26.3508, + "eval_samples_per_second": 18.102, + "eval_steps_per_second": 0.379, + "step": 5900 + }, + { + "epoch": 0.020825258724915686, + "grad_norm": 1.1169177293777466, + "learning_rate": 2.0824656393169513e-05, + "loss": 1.8431, + "step": 6000 + }, + { + "epoch": 0.020825258724915686, + "eval_loss": 1.8211857080459595, + "eval_runtime": 26.3842, + "eval_samples_per_second": 18.079, + "eval_steps_per_second": 0.379, + "step": 6000 + }, + { + "epoch": 0.021172346370330948, + "grad_norm": 1.2566906213760376, + "learning_rate": 2.117173399972234e-05, + "loss": 1.8399, + "step": 6100 + }, + { + "epoch": 0.021172346370330948, + "eval_loss": 1.8177697658538818, + "eval_runtime": 26.4481, + "eval_samples_per_second": 18.035, + "eval_steps_per_second": 0.378, + "step": 6100 + }, + { + "epoch": 0.02151943401574621, + "grad_norm": 1.0141446590423584, + "learning_rate": 2.1518811606275163e-05, + "loss": 1.8412, + "step": 6200 + }, + { + "epoch": 0.02151943401574621, + "eval_loss": 1.8186702728271484, + "eval_runtime": 26.5203, + "eval_samples_per_second": 17.986, + "eval_steps_per_second": 0.377, + "step": 6200 + }, + { + "epoch": 0.021866521661161473, + "grad_norm": 1.1088286638259888, + "learning_rate": 2.186588921282799e-05, + "loss": 1.8433, + "step": 6300 + }, + { + "epoch": 0.021866521661161473, + "eval_loss": 1.8170266151428223, + "eval_runtime": 26.8479, + "eval_samples_per_second": 17.767, + "eval_steps_per_second": 0.372, + "step": 6300 + }, + { + "epoch": 0.02221360930657673, + "grad_norm": 1.0931763648986816, + "learning_rate": 2.2212966819380814e-05, + "loss": 1.8363, + "step": 6400 + }, + { + "epoch": 0.02221360930657673, + "eval_loss": 1.81370210647583, + "eval_runtime": 26.4259, + "eval_samples_per_second": 18.05, + "eval_steps_per_second": 0.378, + "step": 6400 + }, + { + "epoch": 0.022560696951991994, + "grad_norm": 1.040462851524353, + "learning_rate": 2.256004442593364e-05, + "loss": 1.8339, + "step": 6500 + }, + { + "epoch": 0.022560696951991994, + "eval_loss": 1.8151589632034302, + "eval_runtime": 26.44, + "eval_samples_per_second": 18.041, + "eval_steps_per_second": 0.378, + "step": 6500 + }, + { + "epoch": 0.022907784597407256, + "grad_norm": 1.0170607566833496, + "learning_rate": 2.2907122032486464e-05, + "loss": 1.833, + "step": 6600 + }, + { + "epoch": 0.022907784597407256, + "eval_loss": 1.8107798099517822, + "eval_runtime": 26.3941, + "eval_samples_per_second": 18.072, + "eval_steps_per_second": 0.379, + "step": 6600 + }, + { + "epoch": 0.02325487224282252, + "grad_norm": 0.813853919506073, + "learning_rate": 2.325419963903929e-05, + "loss": 1.8338, + "step": 6700 + }, + { + "epoch": 0.02325487224282252, + "eval_loss": 1.8114594221115112, + "eval_runtime": 26.3741, + "eval_samples_per_second": 18.086, + "eval_steps_per_second": 0.379, + "step": 6700 + }, + { + "epoch": 0.023601959888237777, + "grad_norm": 1.4288556575775146, + "learning_rate": 2.3601277245592115e-05, + "loss": 1.8301, + "step": 6800 + }, + { + "epoch": 0.023601959888237777, + "eval_loss": 1.8073900938034058, + "eval_runtime": 26.3565, + "eval_samples_per_second": 18.098, + "eval_steps_per_second": 0.379, + "step": 6800 + }, + { + "epoch": 0.02394904753365304, + "grad_norm": 0.8318554162979126, + "learning_rate": 2.394835485214494e-05, + "loss": 1.8293, + "step": 6900 + }, + { + "epoch": 0.02394904753365304, + "eval_loss": 1.80862295627594, + "eval_runtime": 26.4221, + "eval_samples_per_second": 18.053, + "eval_steps_per_second": 0.378, + "step": 6900 + }, + { + "epoch": 0.024296135179068302, + "grad_norm": 0.9045419096946716, + "learning_rate": 2.4295432458697766e-05, + "loss": 1.8324, + "step": 7000 + }, + { + "epoch": 0.024296135179068302, + "eval_loss": 1.805204153060913, + "eval_runtime": 26.4185, + "eval_samples_per_second": 18.056, + "eval_steps_per_second": 0.379, + "step": 7000 + } + ], + "logging_steps": 100, + "max_steps": 288111, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.46085192400896e+18, + "train_batch_size": 48, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..2ed8732 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0aaabad18018a1e9d309cf888aa604baa9f9a0b564776b67697d9675fde47834 +size 5368