初始化项目,由ModelHub XC社区提供模型
Model: radce/Kepler-3B-v1.2-gguf Source: Original Platform
This commit is contained in:
49
.gitattributes
vendored
Normal file
49
.gitattributes
vendored
Normal file
@@ -0,0 +1,49 @@
|
||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||
*.model filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||
unsloth.Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
unsloth.F16.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
unsloth.Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
unsloth.Q4_0.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
Llama-3.2-3B-ru-v1.2-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
Llama-3.2-3B-ru-v1.2-Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
Llama-3.2-3B-ru-v1.2-Q4_0.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
Llama-3.2-3B-ru-v1.2-F16.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
Kepler-v1.2-F16.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
Kepler-3B-v1.2-Q4_0.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
Kepler-3B-v1.2-F16.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
Kepler-3B-v1.2-Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
Kepler-3B-v1.2-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
3
Kepler-3B-v1.2-F16.gguf
Normal file
3
Kepler-3B-v1.2-F16.gguf
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:be8472c3c6f3d823961e01b37c3e739120b9427c011ece26230989b00a8231ad
|
||||
size 6433706144
|
||||
3
Kepler-3B-v1.2-Q4_0.gguf
Normal file
3
Kepler-3B-v1.2-Q4_0.gguf
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:6b4f3f01fd38af5a3833df5759cee5b9d1d759c0a7d952ea7dc8591383e0fa0a
|
||||
size 1917197888
|
||||
3
Kepler-3B-v1.2-Q5_K_M.gguf
Normal file
3
Kepler-3B-v1.2-Q5_K_M.gguf
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:1e50bde9bafa80c17e2bb055d07a2b5d589b9a69c41a391d30290b14b9dba18c
|
||||
size 2322161216
|
||||
3
Kepler-3B-v1.2-Q8_0.gguf
Normal file
3
Kepler-3B-v1.2-Q8_0.gguf
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:5b6f1d9d2a4bc443d62c04e33a47716e8a114d1ee9ecc830fef95c534c37e5e3
|
||||
size 3421908704
|
||||
184
README.md
Normal file
184
README.md
Normal file
@@ -0,0 +1,184 @@
|
||||
---
|
||||
license: mit
|
||||
language:
|
||||
- ru
|
||||
base_model:
|
||||
- meta-llama/Llama-3.2-3B-Instruct
|
||||
pipeline_tag: text-generation
|
||||
new_version: radce/Kepler-3B-v1.3-gguf
|
||||
---
|
||||
Обучение:
|
||||
```
|
||||
{'loss': 1.0045, 'grad_norm': 1.434638500213623, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.02}
|
||||
{'loss': 0.8268, 'grad_norm': 1.598044753074646, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.03}
|
||||
{'loss': 1.0176, 'grad_norm': 1.381459355354309, 'learning_rate': 6e-06, 'epoch': 0.05}
|
||||
{'loss': 0.8224, 'grad_norm': 1.5868133306503296, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.07}
|
||||
{'loss': 0.9405, 'grad_norm': 1.477469801902771, 'learning_rate': 1e-05, 'epoch': 0.08}
|
||||
{'loss': 0.9827, 'grad_norm': 1.5393009185791016, 'learning_rate': 1.2e-05, 'epoch': 0.1}
|
||||
{'loss': 0.8774, 'grad_norm': 1.2693432569503784, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.12}
|
||||
{'loss': 0.9519, 'grad_norm': 1.2252171039581299, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.14}
|
||||
{'loss': 0.7712, 'grad_norm': 1.2567410469055176, 'learning_rate': 1.8e-05, 'epoch': 0.15}
|
||||
{'loss': 0.9039, 'grad_norm': 1.069510579109192, 'learning_rate': 2e-05, 'epoch': 0.17}
|
||||
{'loss': 0.8103, 'grad_norm': 1.0923222303390503, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.19}
|
||||
{'loss': 0.8733, 'grad_norm': 1.1691577434539795, 'learning_rate': 2.4e-05, 'epoch': 0.2}
|
||||
{'loss': 0.9522, 'grad_norm': 1.1134045124053955, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.22}
|
||||
{'loss': 0.855, 'grad_norm': 1.0952295064926147, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.24}
|
||||
{'loss': 0.8681, 'grad_norm': 1.214271068572998, 'learning_rate': 3e-05, 'epoch': 0.25}
|
||||
{'loss': 1.0045, 'grad_norm': 1.1826456785202026, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.27}
|
||||
{'loss': 0.987, 'grad_norm': 1.2941240072250366, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.29}
|
||||
{'loss': 0.943, 'grad_norm': 1.2107560634613037, 'learning_rate': 3.6e-05, 'epoch': 0.31}
|
||||
{'loss': 0.9103, 'grad_norm': 1.2422106266021729, 'learning_rate': 3.8e-05, 'epoch': 0.32}
|
||||
{'loss': 0.7847, 'grad_norm': 1.1542211771011353, 'learning_rate': 4e-05, 'epoch': 0.34}
|
||||
{'loss': 0.8424, 'grad_norm': 1.1583290100097656, 'learning_rate': 4.2e-05, 'epoch': 0.36}
|
||||
{'loss': 0.8719, 'grad_norm': 1.1088871955871582, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.37}
|
||||
{'loss': 0.8529, 'grad_norm': 1.172914981842041, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.39}
|
||||
{'loss': 0.9943, 'grad_norm': 1.2056026458740234, 'learning_rate': 4.8e-05, 'epoch': 0.41}
|
||||
{'loss': 0.897, 'grad_norm': 1.0738246440887451, 'learning_rate': 5e-05, 'epoch': 0.42}
|
||||
{'loss': 0.9494, 'grad_norm': 1.3239482641220093, 'learning_rate': 4.967105263157895e-05, 'epoch': 0.44}
|
||||
{'loss': 0.84, 'grad_norm': 1.2228782176971436, 'learning_rate': 4.9342105263157894e-05, 'epoch': 0.46}
|
||||
{'loss': 0.9096, 'grad_norm': 1.3999152183532715, 'learning_rate': 4.901315789473684e-05, 'epoch': 0.47}
|
||||
{'loss': 0.8563, 'grad_norm': 1.2490519285202026, 'learning_rate': 4.868421052631579e-05, 'epoch': 0.49}
|
||||
{'loss': 0.7797, 'grad_norm': 1.030503273010254, 'learning_rate': 4.8355263157894734e-05, 'epoch': 0.51}
|
||||
{'loss': 0.8837, 'grad_norm': 0.9622178673744202, 'learning_rate': 4.802631578947368e-05, 'epoch': 0.53}
|
||||
{'loss': 0.9102, 'grad_norm': 1.4168736934661865, 'learning_rate': 4.769736842105263e-05, 'epoch': 0.54}
|
||||
{'loss': 0.9757, 'grad_norm': 1.2101160287857056, 'learning_rate': 4.736842105263158e-05, 'epoch': 0.56}
|
||||
{'loss': 0.8566, 'grad_norm': 1.0162750482559204, 'learning_rate': 4.703947368421053e-05, 'epoch': 0.58}
|
||||
{'loss': 0.8328, 'grad_norm': 1.2085012197494507, 'learning_rate': 4.671052631578948e-05, 'epoch': 0.59}
|
||||
{'loss': 0.8492, 'grad_norm': 1.0620111227035522, 'learning_rate': 4.638157894736843e-05, 'epoch': 0.61}
|
||||
{'loss': 0.8269, 'grad_norm': 1.1411484479904175, 'learning_rate': 4.605263157894737e-05, 'epoch': 0.63}
|
||||
{'loss': 0.8676, 'grad_norm': 1.0270442962646484, 'learning_rate': 4.572368421052632e-05, 'epoch': 0.64}
|
||||
{'loss': 0.6919, 'grad_norm': 1.04926335811615, 'learning_rate': 4.539473684210527e-05, 'epoch': 0.66}
|
||||
{'loss': 0.7621, 'grad_norm': 0.9661086201667786, 'learning_rate': 4.506578947368421e-05, 'epoch': 0.68}
|
||||
{'loss': 0.8536, 'grad_norm': 1.0378029346466064, 'learning_rate': 4.473684210526316e-05, 'epoch': 0.69}
|
||||
{'loss': 0.8257, 'grad_norm': 1.0945830345153809, 'learning_rate': 4.440789473684211e-05, 'epoch': 0.71}
|
||||
{'loss': 0.7799, 'grad_norm': 0.9396383166313171, 'learning_rate': 4.407894736842105e-05, 'epoch': 0.73}
|
||||
{'loss': 0.8284, 'grad_norm': 1.0389983654022217, 'learning_rate': 4.375e-05, 'epoch': 0.75}
|
||||
{'loss': 0.7705, 'grad_norm': 0.9025696516036987, 'learning_rate': 4.342105263157895e-05, 'epoch': 0.76}
|
||||
{'loss': 0.8789, 'grad_norm': 1.2010221481323242, 'learning_rate': 4.30921052631579e-05, 'epoch': 0.78}
|
||||
{'loss': 0.8574, 'grad_norm': 1.147383689880371, 'learning_rate': 4.2763157894736847e-05, 'epoch': 0.8}
|
||||
{'loss': 0.8907, 'grad_norm': 1.0630712509155273, 'learning_rate': 4.2434210526315796e-05, 'epoch': 0.81}
|
||||
{'loss': 0.9025, 'grad_norm': 1.1264381408691406, 'learning_rate': 4.210526315789474e-05, 'epoch': 0.83}
|
||||
{'loss': 0.8128, 'grad_norm': 1.104578971862793, 'learning_rate': 4.177631578947369e-05, 'epoch': 0.85}
|
||||
{'loss': 0.8149, 'grad_norm': 1.351408839225769, 'learning_rate': 4.1447368421052636e-05, 'epoch': 0.86}
|
||||
{'loss': 0.9135, 'grad_norm': 1.0585707426071167, 'learning_rate': 4.111842105263158e-05, 'epoch': 0.88}
|
||||
{'loss': 0.8334, 'grad_norm': 0.9967688322067261, 'learning_rate': 4.078947368421053e-05, 'epoch': 0.9}
|
||||
{'loss': 0.8665, 'grad_norm': 1.0266575813293457, 'learning_rate': 4.0460526315789476e-05, 'epoch': 0.92}
|
||||
{'loss': 0.9028, 'grad_norm': 1.0675684213638306, 'learning_rate': 4.0131578947368425e-05, 'epoch': 0.93}
|
||||
{'loss': 0.8895, 'grad_norm': 1.1028997898101807, 'learning_rate': 3.980263157894737e-05, 'epoch': 0.95}
|
||||
{'loss': 0.9224, 'grad_norm': 1.1560149192810059, 'learning_rate': 3.9473684210526316e-05, 'epoch': 0.97}
|
||||
{'loss': 0.7703, 'grad_norm': 1.0747054815292358, 'learning_rate': 3.9144736842105265e-05, 'epoch': 0.98}
|
||||
{'loss': 0.8491, 'grad_norm': 1.0192369222640991, 'learning_rate': 3.8815789473684214e-05, 'epoch': 1.0}
|
||||
{'loss': 0.7828, 'grad_norm': 1.1316883563995361, 'learning_rate': 3.848684210526316e-05, 'epoch': 1.02}
|
||||
{'loss': 0.7938, 'grad_norm': 1.341268539428711, 'learning_rate': 3.815789473684211e-05, 'epoch': 1.03}
|
||||
{'loss': 0.762, 'grad_norm': 1.0320522785186768, 'learning_rate': 3.7828947368421054e-05, 'epoch': 1.05}
|
||||
{'loss': 0.8595, 'grad_norm': 1.286846399307251, 'learning_rate': 3.7500000000000003e-05, 'epoch': 1.07}
|
||||
{'loss': 0.8076, 'grad_norm': 1.0932618379592896, 'learning_rate': 3.717105263157895e-05, 'epoch': 1.08}
|
||||
{'loss': 0.8065, 'grad_norm': 1.0666605234146118, 'learning_rate': 3.6842105263157895e-05, 'epoch': 1.1}
|
||||
{'loss': 0.7758, 'grad_norm': 1.1275005340576172, 'learning_rate': 3.6513157894736844e-05, 'epoch': 1.12}
|
||||
{'loss': 0.7348, 'grad_norm': 0.9435586333274841, 'learning_rate': 3.618421052631579e-05, 'epoch': 1.14}
|
||||
{'loss': 0.7894, 'grad_norm': 1.0600650310516357, 'learning_rate': 3.5855263157894735e-05, 'epoch': 1.15}
|
||||
{'loss': 0.7364, 'grad_norm': 0.9604922533035278, 'learning_rate': 3.5526315789473684e-05, 'epoch': 1.17}
|
||||
{'loss': 0.8061, 'grad_norm': 0.9637410640716553, 'learning_rate': 3.519736842105263e-05, 'epoch': 1.19}
|
||||
{'loss': 0.7755, 'grad_norm': 0.9641973376274109, 'learning_rate': 3.4868421052631575e-05, 'epoch': 1.2}
|
||||
{'loss': 0.7269, 'grad_norm': 0.9292505979537964, 'learning_rate': 3.4539473684210524e-05, 'epoch': 1.22}
|
||||
{'loss': 0.803, 'grad_norm': 1.021993637084961, 'learning_rate': 3.421052631578947e-05, 'epoch': 1.24}
|
||||
{'loss': 0.7852, 'grad_norm': 1.0617938041687012, 'learning_rate': 3.388157894736842e-05, 'epoch': 1.25}
|
||||
{'loss': 0.919, 'grad_norm': 1.0504485368728638, 'learning_rate': 3.355263157894737e-05, 'epoch': 1.27}
|
||||
{'loss': 0.782, 'grad_norm': 1.0453288555145264, 'learning_rate': 3.322368421052632e-05, 'epoch': 1.29}
|
||||
{'loss': 0.758, 'grad_norm': 1.1698745489120483, 'learning_rate': 3.289473684210527e-05, 'epoch': 1.31}
|
||||
{'loss': 0.7874, 'grad_norm': 0.9532782435417175, 'learning_rate': 3.256578947368421e-05, 'epoch': 1.32}
|
||||
{'loss': 0.7854, 'grad_norm': 1.0863136053085327, 'learning_rate': 3.223684210526316e-05, 'epoch': 1.34}
|
||||
{'loss': 0.9304, 'grad_norm': 0.9589587450027466, 'learning_rate': 3.190789473684211e-05, 'epoch': 1.36}
|
||||
{'loss': 0.8808, 'grad_norm': 1.0683907270431519, 'learning_rate': 3.157894736842105e-05, 'epoch': 1.37}
|
||||
{'loss': 0.7236, 'grad_norm': 1.0750744342803955, 'learning_rate': 3.125e-05, 'epoch': 1.39}
|
||||
{'loss': 0.72, 'grad_norm': 1.0241038799285889, 'learning_rate': 3.092105263157895e-05, 'epoch': 1.41}
|
||||
{'loss': 0.7656, 'grad_norm': 0.961932361125946, 'learning_rate': 3.059210526315789e-05, 'epoch': 1.42}
|
||||
{'loss': 0.7767, 'grad_norm': 1.0009723901748657, 'learning_rate': 3.0263157894736844e-05, 'epoch': 1.44}
|
||||
{'loss': 0.785, 'grad_norm': 0.9455718994140625, 'learning_rate': 2.9934210526315793e-05, 'epoch': 1.46}
|
||||
{'loss': 0.7797, 'grad_norm': 1.1910121440887451, 'learning_rate': 2.9605263157894735e-05, 'epoch': 1.47}
|
||||
{'loss': 0.8361, 'grad_norm': 1.247739315032959, 'learning_rate': 2.9276315789473684e-05, 'epoch': 1.49}
|
||||
{'loss': 0.8375, 'grad_norm': 1.128031849861145, 'learning_rate': 2.8947368421052634e-05, 'epoch': 1.51}
|
||||
{'loss': 0.8447, 'grad_norm': 1.1405525207519531, 'learning_rate': 2.861842105263158e-05, 'epoch': 1.53}
|
||||
{'loss': 0.7917, 'grad_norm': 1.1599271297454834, 'learning_rate': 2.8289473684210528e-05, 'epoch': 1.54}
|
||||
{'loss': 0.8084, 'grad_norm': 0.9633246660232544, 'learning_rate': 2.7960526315789477e-05, 'epoch': 1.56}
|
||||
{'loss': 0.7432, 'grad_norm': 0.9510669112205505, 'learning_rate': 2.7631578947368426e-05, 'epoch': 1.58}
|
||||
{'loss': 0.8118, 'grad_norm': 0.9820936322212219, 'learning_rate': 2.730263157894737e-05, 'epoch': 1.59}
|
||||
{'loss': 0.757, 'grad_norm': 1.0045727491378784, 'learning_rate': 2.6973684210526317e-05, 'epoch': 1.61}
|
||||
{'loss': 0.7861, 'grad_norm': 0.9349510669708252, 'learning_rate': 2.6644736842105266e-05, 'epoch': 1.63}
|
||||
{'loss': 0.8677, 'grad_norm': 1.0302456617355347, 'learning_rate': 2.6315789473684212e-05, 'epoch': 1.64}
|
||||
{'loss': 0.8358, 'grad_norm': 0.9015762209892273, 'learning_rate': 2.598684210526316e-05, 'epoch': 1.66}
|
||||
{'loss': 0.7407, 'grad_norm': 1.0294384956359863, 'learning_rate': 2.565789473684211e-05, 'epoch': 1.68}
|
||||
{'loss': 0.8719, 'grad_norm': 1.0246537923812866, 'learning_rate': 2.5328947368421052e-05, 'epoch': 1.69}
|
||||
{'loss': 0.8762, 'grad_norm': 1.049678921699524, 'learning_rate': 2.5e-05, 'epoch': 1.71}
|
||||
{'loss': 0.7522, 'grad_norm': 1.413390040397644, 'learning_rate': 2.4671052631578947e-05, 'epoch': 1.73}
|
||||
{'loss': 0.7404, 'grad_norm': 0.988237738609314, 'learning_rate': 2.4342105263157896e-05, 'epoch': 1.75}
|
||||
{'loss': 0.7105, 'grad_norm': 0.9710869193077087, 'learning_rate': 2.401315789473684e-05, 'epoch': 1.76}
|
||||
{'loss': 0.7412, 'grad_norm': 1.1005488634109497, 'learning_rate': 2.368421052631579e-05, 'epoch': 1.78}
|
||||
{'loss': 0.8328, 'grad_norm': 1.0540498495101929, 'learning_rate': 2.335526315789474e-05, 'epoch': 1.8}
|
||||
{'loss': 0.8219, 'grad_norm': 1.0091166496276855, 'learning_rate': 2.3026315789473685e-05, 'epoch': 1.81}
|
||||
{'loss': 0.9058, 'grad_norm': 1.3978734016418457, 'learning_rate': 2.2697368421052634e-05, 'epoch': 1.83}
|
||||
{'loss': 0.8499, 'grad_norm': 1.2428942918777466, 'learning_rate': 2.236842105263158e-05, 'epoch': 1.85}
|
||||
{'loss': 0.8286, 'grad_norm': 1.1150847673416138, 'learning_rate': 2.2039473684210525e-05, 'epoch': 1.86}
|
||||
{'loss': 0.8778, 'grad_norm': 1.0458968877792358, 'learning_rate': 2.1710526315789474e-05, 'epoch': 1.88}
|
||||
{'loss': 0.7811, 'grad_norm': 1.1380870342254639, 'learning_rate': 2.1381578947368423e-05, 'epoch': 1.9}
|
||||
{'loss': 0.8515, 'grad_norm': 1.0021414756774902, 'learning_rate': 2.105263157894737e-05, 'epoch': 1.92}
|
||||
{'loss': 0.6846, 'grad_norm': 0.9769864678382874, 'learning_rate': 2.0723684210526318e-05, 'epoch': 1.93}
|
||||
{'loss': 0.7976, 'grad_norm': 0.9892944097518921, 'learning_rate': 2.0394736842105264e-05, 'epoch': 1.95}
|
||||
{'loss': 0.7674, 'grad_norm': 0.9153032898902893, 'learning_rate': 2.0065789473684213e-05, 'epoch': 1.97}
|
||||
{'loss': 0.7683, 'grad_norm': 1.055893898010254, 'learning_rate': 1.9736842105263158e-05, 'epoch': 1.98}
|
||||
{'loss': 0.6988, 'grad_norm': 0.9477760791778564, 'learning_rate': 1.9407894736842107e-05, 'epoch': 2.0}
|
||||
{'loss': 0.758, 'grad_norm': 1.0403963327407837, 'learning_rate': 1.9078947368421056e-05, 'epoch': 2.02}
|
||||
{'loss': 0.7272, 'grad_norm': 1.004212498664856, 'learning_rate': 1.8750000000000002e-05, 'epoch': 2.03}
|
||||
{'loss': 0.7309, 'grad_norm': 1.1268775463104248, 'learning_rate': 1.8421052631578947e-05, 'epoch': 2.05}
|
||||
{'loss': 0.7668, 'grad_norm': 1.1228442192077637, 'learning_rate': 1.8092105263157896e-05, 'epoch': 2.07}
|
||||
{'loss': 0.7394, 'grad_norm': 1.300577163696289, 'learning_rate': 1.7763157894736842e-05, 'epoch': 2.08}
|
||||
{'loss': 0.8575, 'grad_norm': 1.214979887008667, 'learning_rate': 1.7434210526315788e-05, 'epoch': 2.1}
|
||||
{'loss': 0.6988, 'grad_norm': 0.9477760791778564, 'learning_rate': 1.9407894736842107e-05, 'epoch': 2.0}
|
||||
{'loss': 0.758, 'grad_norm': 1.0403963327407837, 'learning_rate': 1.9078947368421056e-05, 'epoch': 2.02}
|
||||
{'loss': 0.7272, 'grad_norm': 1.004212498664856, 'learning_rate': 1.8750000000000002e-05, 'epoch': 2.03}
|
||||
{'loss': 0.7309, 'grad_norm': 1.1268775463104248, 'learning_rate': 1.8421052631578947e-05, 'epoch': 2.05}
|
||||
{'loss': 0.7668, 'grad_norm': 1.1228442192077637, 'learning_rate': 1.8092105263157896e-05, 'epoch': 2.07}
|
||||
{'loss': 0.7394, 'grad_norm': 1.300577163696289, 'learning_rate': 1.7763157894736842e-05, 'epoch': 2.08}
|
||||
{'loss': 0.8575, 'grad_norm': 1.214979887008667, 'learning_rate': 1.7434210526315788e-05, 'epoch': 2.1}
|
||||
{'loss': 0.7105, 'grad_norm': 1.3833553791046143, 'learning_rate': 1.7105263157894737e-05, 'epoch': 2.12}
|
||||
{'loss': 0.7992, 'grad_norm': 1.0076022148132324, 'learning_rate': 1.6776315789473686e-05, 'epoch': 2.14}
|
||||
{'loss': 0.7741, 'grad_norm': 1.0674270391464233, 'learning_rate': 1.6447368421052635e-05, 'epoch': 2.15}
|
||||
{'loss': 0.8255, 'grad_norm': 0.9739935398101807, 'learning_rate': 1.611842105263158e-05, 'epoch': 2.17}
|
||||
{'loss': 0.7364, 'grad_norm': 1.0273830890655518, 'learning_rate': 1.5789473684210526e-05, 'epoch': 2.19}
|
||||
{'loss': 0.7663, 'grad_norm': 0.9955479502677917, 'learning_rate': 1.5460526315789475e-05, 'epoch': 2.2}
|
||||
{'loss': 0.7806, 'grad_norm': 1.0048789978027344, 'learning_rate': 1.5131578947368422e-05, 'epoch': 2.22}
|
||||
{'loss': 0.7314, 'grad_norm': 0.910517692565918, 'learning_rate': 1.4802631578947368e-05, 'epoch': 2.24}
|
||||
{'loss': 0.7101, 'grad_norm': 0.8998332619667053, 'learning_rate': 1.4473684210526317e-05, 'epoch': 2.25}
|
||||
{'loss': 0.8105, 'grad_norm': 1.1091840267181396, 'learning_rate': 1.4144736842105264e-05, 'epoch': 2.27}
|
||||
{'loss': 0.7101, 'grad_norm': 1.0029867887496948, 'learning_rate': 1.3815789473684213e-05, 'epoch': 2.29}
|
||||
{'loss': 0.7245, 'grad_norm': 0.9056518077850342, 'learning_rate': 1.3486842105263159e-05, 'epoch': 2.31}
|
||||
{'loss': 0.7078, 'grad_norm': 0.9426273703575134, 'learning_rate': 1.3157894736842106e-05, 'epoch': 2.32}
|
||||
{'loss': 0.7875, 'grad_norm': 0.9711980819702148, 'learning_rate': 1.2828947368421055e-05, 'epoch': 2.34}
|
||||
{'loss': 0.7573, 'grad_norm': 1.0548990964889526, 'learning_rate': 1.25e-05, 'epoch': 2.36}
|
||||
{'loss': 0.7183, 'grad_norm': 1.0763499736785889, 'learning_rate': 1.2171052631578948e-05, 'epoch': 2.37}
|
||||
{'loss': 0.8136, 'grad_norm': 0.9355372190475464, 'learning_rate': 1.1842105263157895e-05, 'epoch': 2.39}
|
||||
{'loss': 0.8372, 'grad_norm': 0.9965329170227051, 'learning_rate': 1.1513157894736843e-05, 'epoch': 2.41}
|
||||
{'loss': 0.7249, 'grad_norm': 1.012364387512207, 'learning_rate': 1.118421052631579e-05, 'epoch': 2.42}
|
||||
{'loss': 0.7521, 'grad_norm': 0.9542561769485474, 'learning_rate': 1.0855263157894737e-05, 'epoch': 2.44}
|
||||
{'loss': 0.7672, 'grad_norm': 1.037903070449829, 'learning_rate': 1.0526315789473684e-05, 'epoch': 2.46}
|
||||
{'loss': 0.7121, 'grad_norm': 0.9946797490119934, 'learning_rate': 1.0197368421052632e-05, 'epoch': 2.47}
|
||||
{'loss': 0.6738, 'grad_norm': 0.9309292435646057, 'learning_rate': 9.868421052631579e-06, 'epoch': 2.49}
|
||||
{'loss': 0.7584, 'grad_norm': 1.0045052766799927, 'learning_rate': 9.539473684210528e-06, 'epoch': 2.51}
|
||||
{'loss': 0.7027, 'grad_norm': 0.9433549046516418, 'learning_rate': 9.210526315789474e-06, 'epoch': 2.53}
|
||||
{'loss': 0.8276, 'grad_norm': 0.9263595938682556, 'learning_rate': 8.881578947368421e-06, 'epoch': 2.54}
|
||||
{'loss': 0.8488, 'grad_norm': 0.9942632913589478, 'learning_rate': 8.552631578947368e-06, 'epoch': 2.56}
|
||||
{'loss': 0.6354, 'grad_norm': 1.0013046264648438, 'learning_rate': 8.223684210526317e-06, 'epoch': 2.58}
|
||||
{'loss': 0.7416, 'grad_norm': 1.0437984466552734, 'learning_rate': 7.894736842105263e-06, 'epoch': 2.59}
|
||||
{'loss': 0.7597, 'grad_norm': 0.9986825585365295, 'learning_rate': 7.565789473684211e-06, 'epoch': 2.61}
|
||||
{'loss': 0.741, 'grad_norm': 0.9330255389213562, 'learning_rate': 7.236842105263158e-06, 'epoch': 2.63}
|
||||
{'loss': 0.7414, 'grad_norm': 0.9645633697509766, 'learning_rate': 6.9078947368421065e-06, 'epoch': 2.64}
|
||||
{'loss': 0.737, 'grad_norm': 0.9212505221366882, 'learning_rate': 6.578947368421053e-06, 'epoch': 2.66}
|
||||
{'loss': 0.7132, 'grad_norm': 0.9016715884208679, 'learning_rate': 6.25e-06, 'epoch': 2.68}
|
||||
{'loss': 0.7327, 'grad_norm': 1.0068784952163696, 'learning_rate': 5.921052631578948e-06, 'epoch': 2.69}
|
||||
{'loss': 0.757, 'grad_norm': 1.0225309133529663, 'learning_rate': 5.592105263157895e-06, 'epoch': 2.71}
|
||||
{'loss': 0.7916, 'grad_norm': 0.9414555430412292, 'learning_rate': 5.263157894736842e-06, 'epoch': 2.73}
|
||||
{'loss': 0.7665, 'grad_norm': 0.9226305484771729, 'learning_rate': 4.9342105263157895e-06, 'epoch': 2.75}
|
||||
```
|
||||
1 эпоха: слои [-1]
|
||||
2 эпоха: слои [-1, -2]
|
||||
3 эпоха: слои [-1, -2, -3]
|
||||
40
config.json
Normal file
40
config.json
Normal file
@@ -0,0 +1,40 @@
|
||||
{
|
||||
"_name_or_path": "radce/Llama-3.2-3B",
|
||||
"architectures": [
|
||||
"LlamaForCausalLM"
|
||||
],
|
||||
"attention_bias": false,
|
||||
"attention_dropout": 0.0,
|
||||
"bos_token_id": 128000,
|
||||
"eos_token_id": [
|
||||
128001,
|
||||
128008,
|
||||
128009
|
||||
],
|
||||
"head_dim": 128,
|
||||
"hidden_act": "silu",
|
||||
"hidden_size": 3072,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 8192,
|
||||
"max_position_embeddings": 131072,
|
||||
"mlp_bias": false,
|
||||
"model_type": "llama",
|
||||
"num_attention_heads": 24,
|
||||
"num_hidden_layers": 28,
|
||||
"num_key_value_heads": 8,
|
||||
"pretraining_tp": 1,
|
||||
"rms_norm_eps": 1e-05,
|
||||
"rope_scaling": {
|
||||
"factor": 32.0,
|
||||
"high_freq_factor": 4.0,
|
||||
"low_freq_factor": 1.0,
|
||||
"original_max_position_embeddings": 8192,
|
||||
"rope_type": "llama3"
|
||||
},
|
||||
"rope_theta": 500000.0,
|
||||
"tie_word_embeddings": true,
|
||||
"torch_dtype": "bfloat16",
|
||||
"transformers_version": "4.45.2",
|
||||
"use_cache": true,
|
||||
"vocab_size": 128256
|
||||
}
|
||||
12
generation_config.json
Normal file
12
generation_config.json
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"bos_token_id": 128000,
|
||||
"do_sample": true,
|
||||
"eos_token_id": [
|
||||
128001,
|
||||
128008,
|
||||
128009
|
||||
],
|
||||
"temperature": 0.6,
|
||||
"top_p": 0.9,
|
||||
"transformers_version": "4.45.2"
|
||||
}
|
||||
3
model-00001-of-00002.safetensors
Normal file
3
model-00001-of-00002.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:9a5bee964604edf2c5b3e05eaea3381e63ef90b7dbbf03378d7d417f1c3ce6f2
|
||||
size 4965799096
|
||||
3
model-00002-of-00002.safetensors
Normal file
3
model-00002-of-00002.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:5162f8ba48acb62f7ebc2b54036e302338cb6d0e6d4e2668382db937afe12e02
|
||||
size 1459729952
|
||||
261
model.safetensors.index.json
Normal file
261
model.safetensors.index.json
Normal file
@@ -0,0 +1,261 @@
|
||||
{
|
||||
"metadata": {
|
||||
"total_size": 6425499648
|
||||
},
|
||||
"weight_map": {
|
||||
"model.embed_tokens.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.norm.weight": "model-00002-of-00002.safetensors"
|
||||
}
|
||||
}
|
||||
23
special_tokens_map.json
Normal file
23
special_tokens_map.json
Normal file
@@ -0,0 +1,23 @@
|
||||
{
|
||||
"bos_token": {
|
||||
"content": "<|begin_of_text|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"eos_token": {
|
||||
"content": "<|eot_id|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": {
|
||||
"content": "<|finetune_right_pad_id|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
}
|
||||
}
|
||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:986aa57ece4024027cc214277d56f511641b31a9814861803de728efedb1f7f4
|
||||
size 17210499
|
||||
2089
tokenizer_config.json
Normal file
2089
tokenizer_config.json
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user