From 7b9ae9318766760f69c4880c5360c8358106c6ef Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Tue, 5 May 2026 06:35:10 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: PAI/pai-baichuan2-7b-doc2qa Source: Original Platform --- .gitattributes | 37 + config.json | 30 + configuration.json | 1 + configuration_baichuan.py | 68 + generation_config.json | 14 + generation_utils.py | 83 + modeling_baichuan.py | 783 +++++ pytorch_model-00001-of-00002.bin | 3 + pytorch_model-00002-of-00002.bin | 3 + pytorch_model.bin.index.json | 3 + quantizer.py | 210 ++ special_tokens_map.json | 30 + tokenization_baichuan.py | 253 ++ tokenizer.model | 3 + tokenizer_config.json | 48 + trainer_state.json | 4708 ++++++++++++++++++++++++++++++ training_args.bin | 3 + 17 files changed, 6280 insertions(+) create mode 100644 .gitattributes create mode 100644 config.json create mode 100644 configuration.json create mode 100644 configuration_baichuan.py create mode 100644 generation_config.json create mode 100644 generation_utils.py create mode 100644 modeling_baichuan.py create mode 100644 pytorch_model-00001-of-00002.bin create mode 100644 pytorch_model-00002-of-00002.bin create mode 100644 pytorch_model.bin.index.json create mode 100644 quantizer.py create mode 100644 special_tokens_map.json create mode 100644 tokenization_baichuan.py create mode 100644 tokenizer.model create mode 100644 tokenizer_config.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..b7e9e3a --- /dev/null +++ b/.gitattributes @@ -0,0 +1,37 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* 
filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text diff --git a/config.json b/config.json new file mode 100644 index 0000000..26c432c --- /dev/null +++ b/config.json @@ -0,0 +1,30 @@ +{ + "_from_model_config": true, + "_name_or_path": "/workspace/baichuan2_7b_base/", + "architectures": [ + "BaichuanForCausalLM" + ], + "auto_map": { + "AutoConfig": "configuration_baichuan.BaichuanConfig", + "AutoModelForCausalLM": "modeling_baichuan.BaichuanForCausalLM" + }, + "bos_token_id": 1, + "eos_token_id": 2, + 
"hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 4096, + "model_max_length": 4096, + "model_type": "baichuan", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "pad_token_id": 0, + "rms_norm_eps": 1e-06, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.33.0", + "use_cache": true, + "vocab_size": 125696, + "z_loss_weight": 0.001 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..f9291c3 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework":"Pytorch","task":"text-generation"} \ No newline at end of file diff --git a/configuration_baichuan.py b/configuration_baichuan.py new file mode 100644 index 0000000..f57eded --- /dev/null +++ b/configuration_baichuan.py @@ -0,0 +1,68 @@ +# Copyright 2023 Baichuan Inc. All Rights Reserved. + +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + + +class BaichuanConfig(PretrainedConfig): + model_type = "baichuan" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=125696, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + hidden_act="silu", + max_position_embeddings=4096, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.z_loss_weight = 0.001 + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..dad8d19 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,14 @@ +{ + "assistant_token_id": 196, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": 2, + "max_new_tokens": 2048, + "pad_token_id": 0, + "repetition_penalty": 1.05, + "temperature": 0.3, + "top_k": 5, + "top_p": 0.85, + "transformers_version": "4.33.0", + "user_token_id": 195 +} diff --git a/generation_utils.py b/generation_utils.py new file mode 100644 index 0000000..5771699 --- /dev/null +++ b/generation_utils.py @@ -0,0 +1,83 @@ +from typing import List +from queue import Queue + +import torch + + +def build_chat_input(model, tokenizer, messages: List[dict], max_new_tokens: int=0): + def 
_parse_messages(messages, split_role="user"): + system, rounds = "", [] + round = [] + for i, message in enumerate(messages): + if message["role"] == "system": + assert i == 0 + system = message["content"] + continue + if message["role"] == split_role and round: + rounds.append(round) + round = [] + round.append(message) + if round: + rounds.append(round) + return system, rounds + + max_new_tokens = max_new_tokens or model.generation_config.max_new_tokens + max_input_tokens = model.config.model_max_length - max_new_tokens + system, rounds = _parse_messages(messages, split_role="user") + system_tokens = tokenizer.encode(system) + max_history_tokens = max_input_tokens - len(system_tokens) + + history_tokens = [] + for round in rounds[::-1]: + round_tokens = [] + for message in round: + if message["role"] == "user": + round_tokens.append(model.generation_config.user_token_id) + else: + round_tokens.append(model.generation_config.assistant_token_id) + round_tokens.extend(tokenizer.encode(message["content"])) + if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens: + history_tokens = round_tokens + history_tokens # concat left + if len(history_tokens) < max_history_tokens: + continue + break + + input_tokens = system_tokens + history_tokens + if messages[-1]["role"] != "assistant": + input_tokens.append(model.generation_config.assistant_token_id) + input_tokens = input_tokens[-max_input_tokens:] # truncate left + return torch.LongTensor([input_tokens]).to(model.device) + + +class TextIterStreamer: + def __init__(self, tokenizer, skip_prompt=False, skip_special_tokens=False): + self.tokenizer = tokenizer + self.skip_prompt = skip_prompt + self.skip_special_tokens = skip_special_tokens + self.tokens = [] + self.text_queue = Queue() + self.next_tokens_are_prompt = True + + def put(self, value): + if self.skip_prompt and self.next_tokens_are_prompt: + self.next_tokens_are_prompt = False + else: + if len(value.shape) > 1: + value = 
value[0] + self.tokens.extend(value.tolist()) + self.text_queue.put( + self.tokenizer.decode(self.tokens, skip_special_tokens=self.skip_special_tokens)) + + def end(self): + self.text_queue.put(None) + + def __iter__(self): + return self + + def __next__(self): + value = self.text_queue.get() + if value is None: + raise StopIteration() + else: + return value + diff --git a/modeling_baichuan.py b/modeling_baichuan.py new file mode 100644 index 0000000..0cef964 --- /dev/null +++ b/modeling_baichuan.py @@ -0,0 +1,783 @@ +# Copyright 2023 Baichuan Inc. All Rights Reserved. + +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from .configuration_baichuan import BaichuanConfig +from .generation_utils import build_chat_input, TextIterStreamer + +import math +from typing import List, Optional, Tuple, Union +from threading import Thread + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from torch.nn import functional as F +from transformers import PreTrainedModel, PretrainedConfig +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from transformers.generation.utils import GenerationConfig +from transformers.utils import logging, ContextManagers + +import os +from contextlib import contextmanager +logger = logging.get_logger(__name__) + +try: + from xformers import ops as xops +except ImportError: + xops = None + logger.warning( + "Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers\npip install xformers." + ) + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + """ + Make causal mask used for bi-directional self-attention. 
+ """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + if len(mask.size()) == 3: + bsz, src_len, _ = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + expanded_mask = mask[:,None,:,:].expand(bsz, 1, tgt_len, src_len).to(dtype) + else: + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +class RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states + + +class RotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + 
super().__init__() + self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) + self.max_seq_len_cached = max_position_embeddings + t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=torch.float32) + freqs = torch.outer(t, self.inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + self.cos_cached = emb.cos()[None, None, :, :].to(torch.float32) + self.sin_cached = emb.sin()[None, None, :, :].to(torch.float32) + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. + if seq_len > self.max_seq_len_cached: + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=torch.float32) + freqs = torch.outer(t, self.inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + self.cos_cached = emb.cos()[None, None, :, :].to(torch.float32).to(x.device) + self.sin_cached = emb.sin()[None, None, :, :].to(torch.float32).to(x.device) + elif self.cos_cached.device != x.device: + self.cos_cached = self.cos_cached.to(x.device) + self.sin_cached = self.sin_cached.to(x.device) + return ( + self.cos_cached[:, :, :seq_len, ...], + self.sin_cached[:, :, :seq_len, ...], + ) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos_, sin_, position_ids): + cos = cos_.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin_.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q.float() * cos) + (rotate_half(q.float()) * sin) + k_embed = (k.float() * cos) + (rotate_half(k.float()) * sin) + return q_embed.to(q.dtype), k_embed.to(k.dtype) + + +class MLP(nn.Module): + def 
__init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + ): + super().__init__() + self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) + self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.act_fn = ACT2FN[hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +class Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + def __init__(self, config: BaichuanConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.max_position_embeddings = config.max_position_embeddings + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + self.W_pack = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self.rotary_emb = RotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + proj = self.W_pack(hidden_states) + proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2) + query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + if xops is not None and self.training: + attn_weights = None + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + 
attn_output = xops.memory_efficient_attention( + query_states, key_states, value_states, attn_bias=xops.LowerTriangularMask() + ) + else: + with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): + attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = attention_mask) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class DecoderLayer(nn.Module): + def __init__(self, config: BaichuanConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = Attention(config=config) + self.mlp = MLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + ) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = 
self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class BaichuanPreTrainedModel(PreTrainedModel): + config_class = BaichuanConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["DecoderLayer"] + _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, BaichuanModel): + module.gradient_checkpointing = value + + +class BaichuanModel(BaichuanPreTrainedModel): + def __init__(self, config: BaichuanConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): 
+ # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to 
specify either decoder_input_ids or decoder_inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + # embed positions + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, None) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class NormHead(nn.Module): + def __init__(self, hidden_size, vocab_size, bias=False): + super().__init__() + self.weight = nn.Parameter(torch.empty((vocab_size, hidden_size))) + nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + self.first_flag = True + + def 
class BaichuanForCausalLM(BaichuanPreTrainedModel):
    """Baichuan decoder (`BaichuanModel`) followed by a `NormHead` LM head.

    Besides the usual causal-LM forward pass this class offers:

    * loading of checkpoints that were quantized to 4 bit offline
      (`from_pretrained` with ``config.quantization_config`` being a dict
      whose ``load_in_4bit`` entry is truthy),
    * on-the-fly quantization of an already loaded model (`quantize`),
    * a small chat helper built on top of `generate` (`chat`).
    """

    @staticmethod
    def _wants_offline_int4(config) -> bool:
        """Return True when *config* describes an offline 4-bit checkpoint.

        Only a plain ``dict`` in ``config.quantization_config`` with a truthy
        ``load_in_4bit`` entry qualifies.  Any other value (attribute missing,
        key missing, or a non-dict object) returns False, so neither
        `__init__` nor `from_pretrained` can fail on the lookup itself.
        """
        quant_cfg = getattr(config, "quantization_config", None)
        return isinstance(quant_cfg, dict) and bool(quant_cfg.get("load_in_4bit", False))

    def __init__(self, config, *model_args, **model_kwargs):
        """Build the decoder and LM head; swap in 4-bit linears when requested."""
        super().__init__(config, *model_args, **model_kwargs)
        self.model = BaichuanModel(config)

        self.lm_head = NormHead(config.hidden_size, config.vocab_size, bias=False)
        if self._wants_offline_int4(config):
            try:
                from .quantizer import quantize_offline
            except ImportError as err:
                raise ImportError("Needs QLinear to run quantize.") from err
            quantize_offline(self, 4)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module of the decoder."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module of the decoder."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the LM head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the LM head."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped decoder."""
        self.model = decoder

    def get_decoder(self):
        """Return the wrapped decoder."""
        return self.model

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
        *model_args,
        config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
        cache_dir: Optional[Union[str, os.PathLike]] = None,
        ignore_mismatched_sizes: bool = False,
        force_download: bool = False,
        local_files_only: bool = False,
        token: Optional[Union[str, bool]] = None,
        revision: str = "main",
        use_safetensors: bool = None,
        **kwargs,
    ):
        """Load a model, with a dedicated path for offline 4-bit checkpoints.

        When the resolved config does not describe an offline 4-bit checkpoint
        (see `_wants_offline_int4`) the call is forwarded unchanged to the
        parent implementation.  Otherwise the model skeleton is created on
        empty weights, ``pytorch_model.bin`` is read from
        ``pretrained_model_name_or_path``, the quantized tensors are attached
        via `init_model_weight_int4`, and the model is dispatched according to
        the device map inferred by accelerate.

        Raises:
            ImportError: if the quantizer module or accelerate is unavailable
                on the 4-bit path.
        """
        # Load config if we don't provide a configuration
        if not isinstance(config, PretrainedConfig):
            config_path = config if config is not None else pretrained_model_name_or_path
            config, _unused_kwargs = cls.config_class.from_pretrained(
                config_path,
                cache_dir=cache_dir,
                return_unused_kwargs=True,
                force_download=force_download,
                resume_download=False,
                proxies=None,
                local_files_only=local_files_only,
                token=token,
                revision=revision,
                subfolder="",
                _from_auto=False,
                _from_pipeline=None,
                **kwargs,
            )

        if not cls._wants_offline_int4(config):
            return super(BaichuanForCausalLM, cls).from_pretrained(
                pretrained_model_name_or_path,
                *model_args,
                config=config,
                cache_dir=cache_dir,
                ignore_mismatched_sizes=ignore_mismatched_sizes,
                force_download=force_download,
                local_files_only=local_files_only,
                token=token,
                revision=revision,
                use_safetensors=use_safetensors,
                **kwargs,
            )

        try:
            from .quantizer import init_model_weight_int4
            from accelerate import init_empty_weights, dispatch_model, infer_auto_device_map
            from accelerate.utils import CustomDtype
            from accelerate.utils import get_balanced_memory
        except ImportError as err:
            raise ImportError("Needs import model weight init func to run quantize.") from err

        # Instantiate model without allocating or initialising real weights.
        with ContextManagers([no_init_weights(_enable=True), init_empty_weights()]):
            model = cls(config)

        model_file = os.path.join(pretrained_model_name_or_path, "pytorch_model.bin")
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # that come from a trusted source.
        state_dict = torch.load(model_file, map_location="cpu")
        model.is_quantized = True

        device_map = kwargs.pop("device_map", None)
        # Accepted for signature compatibility; the 4-bit path does not use it.
        kwargs.pop("torch_dtype", None)

        # Keep accelerate's placement arguments in their own dict so they are
        # never forwarded to unrelated loaders further down.
        placement_kwargs = {"no_split_module_classes": model._no_split_modules}
        target_dtype = CustomDtype.INT4
        max_memory = get_balanced_memory(
            model,
            dtype=target_dtype,
            low_zero=(device_map == "balanced_low_0"),
            max_memory=None,
            **placement_kwargs,
        )
        placement_kwargs["max_memory"] = max_memory

        device_map = infer_auto_device_map(model, dtype=target_dtype, **placement_kwargs)
        model = init_model_weight_int4(config, model, state_dict)

        # Set model in evaluation mode to deactivate DropOut modules by default
        model.eval()
        # If it is a model with generation capabilities, attempt to load the generation config
        if model.can_generate():
            try:
                model.generation_config = GenerationConfig.from_pretrained(
                    pretrained_model_name_or_path,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    resume_download=False,
                    proxies=None,
                    local_files_only=local_files_only,
                    token=token,
                    revision=revision,
                    subfolder="",
                    _from_auto=False,
                    _from_pipeline=None,
                )
            except (OSError, TypeError):
                logger.info(
                    "Generation config file not found, using a generation config created from the model config."
                )

        if device_map is not None:
            dispatch_model(model, device_map=device_map)

        return model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the decoder and LM head; compute the loss when *labels* is given.

        The loss is next-token cross entropy plus an auxiliary term equal to
        ``config.z_loss_weight`` times the mean squared per-position maximum
        logit.

        Returns:
            A `CausalLMOutputWithPast`, or a plain tuple when ``return_dict``
            resolves to False (loss first, if one was computed).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)
        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Auxiliary penalty on the largest logit of every position.
            softmax_normalizer = shift_logits.max(-1).values ** 2
            z_loss = self.config.z_loss_weight * softmax_normalizer.mean()
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels) + z_loss

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        """Assemble the keyword arguments for one generation step.

        With a non-empty cache only the last token (and its position id) is
        fed.  Position ids are derived from the attention mask when the caller
        did not supply them.
        """
        if past_key_values:
            input_ids = input_ids[:, -1:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Re-index every cached tensor along dim 0 with *beam_idx*."""
        return tuple(
            tuple(past_state.index_select(0, beam_idx) for past_state in layer_past)
            for layer_past in past_key_values
        )

    def quantize(self, bits: int):
        """Quantize this model in place via `quantizer.quantize_online`.

        Raises:
            ImportError: if the quantizer module cannot be imported.
        """
        try:
            from .quantizer import quantize_online
        except ImportError as err:
            raise ImportError("Needs QLinear to run quantize.") from err
        return quantize_online(self, bits)

    def chat(self, tokenizer, messages: List[dict], stream=False,
             generation_config: Optional[GenerationConfig]=None):
        """Answer a chat *messages* list.

        Returns the decoded reply, or, when ``stream`` is true, a
        `TextIterStreamer` that is fed by a background generation thread.
        """
        generation_config = generation_config or self.generation_config
        input_ids = build_chat_input(self, tokenizer, messages, generation_config.max_new_tokens)
        if stream:
            streamer = TextIterStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
            Thread(target=self.generate, kwargs=dict(
                inputs=input_ids, streamer=streamer,
                generation_config=generation_config,
            )).start()
            return streamer

        outputs = self.generate(input_ids, generation_config=generation_config)
        # Strip the prompt tokens so only the newly generated part is decoded.
        response = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
        return response
def Params4bitCuda(self, device):
    """Move a 4-bit parameter and the tensors of its quant state to *device*.

    Calls ``.cuda(device)`` on ``self.data`` and on the ``quant_state``
    entries at ``[0]``, ``[4][0]``, ``[4][1][0]``, ``[4][1][1]`` and ``[6]``,
    storing each result back in place.  All other entries are left untouched.
    Returns ``self``.
    """
    self.data = self.data.cuda(device)

    state = self.quant_state
    state[0] = state[0].cuda(device)

    nested = state[4]
    nested[0] = nested[0].cuda(device)
    innermost = nested[1]
    for slot in (0, 1):
        innermost[slot] = innermost[slot].cuda(device)

    state[6] = state[6].cuda(device)
    return self
class Linear8bitLtOnline(torch.nn.Module):
    """Linear layer whose weight is wrapped in bitsandbytes `Int8Params`.

    Built from an existing ``weight``/``bias`` pair; the matmul itself is
    delegated to ``bnb.matmul`` together with a `MatmulLtState`.
    """

    def __init__(
        self,
        weight,
        bias,
        has_fp16_weights=True,
        memory_efficient_backward=False,
        threshold=0.0,
        index=None,
    ):
        super().__init__()
        assert (
            not memory_efficient_backward
        ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
        self.state = bnb.MatmulLtState()
        self.index = index

        # Necessary for stacked layers
        matmul_state = self.state
        matmul_state.threshold = threshold
        matmul_state.has_fp16_weights = has_fp16_weights
        matmul_state.memory_efficient_backward = memory_efficient_backward
        if not has_fp16_weights and threshold > 0.0:
            matmul_state.use_pool = True

        self.weight = Int8Params(
            weight.data,
            has_fp16_weights=has_fp16_weights,
            requires_grad=has_fp16_weights,
        )
        self.bias = bias

    def init_8bit_state(self):
        """Hand ``CB``/``SCB`` from the weight over to the matmul state."""
        params, matmul_state = self.weight, self.state
        matmul_state.CB = params.CB
        matmul_state.SCB = params.SCB
        params.CB = None
        params.SCB = None

    def forward(self, x: torch.Tensor):
        """Apply the 8-bit matmul to *x* and return the result."""
        matmul_state = self.state
        matmul_state.is_training = self.training
        if self.weight.CB is not None:
            self.init_8bit_state()

        # weights are cast automatically as Int8Params, but the bias has to be cast manually
        bias = self.bias
        if bias is not None and bias.dtype != x.dtype:
            bias.data = bias.data.to(x.dtype)

        out = bnb.matmul(x, self.weight, bias=self.bias, state=matmul_state)

        if (
            not matmul_state.has_fp16_weights
            and matmul_state.CB is not None
            and matmul_state.CxB is not None
        ):
            # we converted 8-bit row major to turing/ampere format in the first inference pass
            # we no longer need the row-major weight
            del matmul_state.CB
            self.weight.data = matmul_state.CxB
        return out
def quantize_offline(model, bits: int):
    """Replace the attention/MLP projections of every layer with empty NF4 linears.

    For each entry of ``model.model.layers`` the modules ``self_attn.W_pack``,
    ``self_attn.o_proj``, ``mlp.gate_proj``, ``mlp.down_proj`` and
    ``mlp.up_proj`` are swapped (in that order) for a bias-free
    ``bnb.nn.Linear4bit`` of the same shape.  Only ``bits == 4`` is accepted.
    Returns ``model``.
    """
    assert (bits == 4), f'bits: {bits} is not supported'

    def _nf4_like(linear):
        # weight is laid out as (out_features, in_features)
        out_features, in_features = linear.weight.shape[0], linear.weight.shape[1]
        return bnb.nn.Linear4bit(
            in_features,
            out_features,
            False,
            torch.float16,
            compress_statistics=True,
            quant_type="nf4",
        )

    targets = (
        ("self_attn", "W_pack"),
        ("self_attn", "o_proj"),
        ("mlp", "gate_proj"),
        ("mlp", "down_proj"),
        ("mlp", "up_proj"),
    )
    for layer in model.model.layers:
        for parent_name, child_name in targets:
            parent = getattr(layer, parent_name)
            setattr(parent, child_name, _nf4_like(getattr(parent, child_name)))
    return model
def init_model_weight_int4(config, model, state_dict):
    """Attach pre-quantized 4-bit weights from *state_dict* to *model*.

    For every layer index below ``config.num_hidden_layers`` the five
    projection weights are rebuilt as `Params4bit` from the
    ``<name>.weight.data`` / ``<name>.weight.quant_state`` entries, and the two
    layer-norm weights are assigned directly.  The embedding, final norm and
    LM-head weights are assigned last.  Returns ``model``.
    """
    # replace Params4bit.cuda with Params4bitCuda
    Params4bit.cuda = Params4bitCuda

    quantized_projections = (
        ("self_attn", "W_pack"),
        ("self_attn", "o_proj"),
        ("mlp", "gate_proj"),
        ("mlp", "up_proj"),
        ("mlp", "down_proj"),
    )
    layer_norms = ("input_layernorm", "post_attention_layernorm")

    for idx in range(config.num_hidden_layers):
        block = model.model.layers[idx]
        prefix = f"model.layers.{idx}"

        for parent_name, child_name in quantized_projections:
            key = f"{prefix}.{parent_name}.{child_name}.weight"
            packed = state_dict[f"{key}.data"]
            quant_state = state_dict[f"{key}.quant_state"]
            target = getattr(getattr(block, parent_name), child_name)
            target.weight = Params4bit(packed, requires_grad=False, quant_state=quant_state)

        for norm_name in layer_norms:
            getattr(block, norm_name).weight = state_dict[f"{prefix}.{norm_name}.weight"]

    model.model.embed_tokens.weight = state_dict['model.embed_tokens.weight']
    model.model.norm.weight = state_dict['model.norm.weight']
    model.lm_head.weight = state_dict['lm_head.weight']
    return model
class BaichuanTokenizer(PreTrainedTokenizer):
    """
    Tokenizer for Baichuan models backed by a SentencePiece model file.

    Args:
        vocab_file (`str`):
            Path to the SentencePiece model that is loaded at construction.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="",
        bos_token="",
        eos_token="",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        def _as_added_token(tok):
            # Plain strings become AddedToken; anything else passes through.
            if isinstance(tok, str):
                return AddedToken(tok, lstrip=False, rstrip=False)
            return tok

        self.sp_model_kwargs = sp_model_kwargs if sp_model_kwargs is not None else {}
        bos_token = _as_added_token(bos_token)
        eos_token = _as_added_token(eos_token)
        unk_token = _as_added_token(unk_token)
        pad_token = _as_added_token(pad_token)

        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        # The SentencePiece model is loaded before the base-class constructor runs.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def __getstate__(self):
        # The SentencePiece processor is dropped from the pickled state.
        return {**self.__dict__, "sp_model": None}

    def __setstate__(self, d):
        self.__dict__ = d
        # Rebuild the processor from the stored vocab file path.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self):
        """Number of pieces in the SentencePiece model."""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Return the token-to-id mapping, including added tokens."""
        vocab = {}
        for token_id in range(self.vocab_size):
            vocab[self.convert_ids_to_tokens(token_id)] = token_id
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Split *text* into SentencePiece pieces (strings)."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Map a piece (str) to its id."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Map an id (int) to its piece (str)."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens):
        """Join a sequence of tokens back into one string.

        Special tokens are emitted verbatim; the runs of ordinary pieces
        between them are decoded with the SentencePiece model.
        """
        chunks = []
        pending = []
        last_was_special = False
        for position, token in enumerate(tokens):
            # make sure that special tokens are not decoded using sentencepiece model
            if token not in self.all_special_tokens:
                pending.append(token)
                last_was_special = False
                continue
            if position != 0 and not last_was_special:
                chunks.append(" ")
            chunks.append(self.sp_model.decode(pending) + token)
            pending = []
            last_was_special = True
        chunks.append(self.sp_model.decode(pending))
        return "".join(chunks)

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the SentencePiece model file into *save_directory*.

        Args:
            save_directory (`str`):
                Existing directory that receives the file.
            filename_prefix (`str`, *optional*):
                When given, prepended to the file name followed by ``-``.

        Returns:
            `Tuple(str)`: Path of the saved file. Returns `None` (after logging
            an error) when *save_directory* is not a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        prefix = filename_prefix + "-" if filename_prefix else ""
        out_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])

        if not os.path.isfile(self.vocab_file):
            # Source file is gone: dump the in-memory model instead.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)
        elif os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Wrap one or two id sequences with BOS/EOS ids as configured."""
        prefix = [self.bos_token_id] if self.add_bos_token else []
        suffix = [self.eos_token_id] if self.add_eos_token else []

        if token_ids_1 is None:
            return prefix + token_ids_0 + suffix
        return prefix + token_ids_0 + suffix + prefix + token_ids_1 + suffix

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Build the special-token mask for sequences that have no special tokens yet.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                When true, the computation is delegated to the base class.

        Returns:
            `List[int]`: 1 at BOS/EOS positions (as configured), 0 at sequence positions.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        bos_flag = [1] if self.add_bos_token else []
        eos_flag = [1] if self.add_eos_token else []

        mask = bos_flag + ([0] * len(token_ids_0)) + eos_flag
        if token_ids_1 is not None:
            mask = mask + bos_flag + ([0] * len(token_ids_1)) + eos_flag
        return mask

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build token type ids for one sequence or a sequence pair:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        Each segment includes its BOS/EOS positions when those are enabled.
        If token_ids_1 is None, only the first portion (0s) is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: The token type ids for the given sequence(s).
        """
        prefix = [self.bos_token_id] if self.add_bos_token else []
        suffix = [self.eos_token_id] if self.add_eos_token else []

        segments = [0] * len(prefix + token_ids_0 + suffix)
        if token_ids_1 is not None:
            segments += [1] * len(prefix + token_ids_1 + suffix)
        return segments
"is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.128205128205128e-07, + "loss": 2.3344, + "step": 2 + }, + { + "epoch": 0.01, + "learning_rate": 1.0256410256410257e-06, + "loss": 2.2235, + "step": 4 + }, + { + "epoch": 0.01, + "learning_rate": 1.5384615384615387e-06, + "loss": 1.9635, + "step": 6 + }, + { + "epoch": 0.01, + "learning_rate": 2.0512820512820513e-06, + "loss": 1.6158, + "step": 8 + }, + { + "epoch": 0.01, + "learning_rate": 2.564102564102564e-06, + "loss": 1.3268, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 3.0769230769230774e-06, + "loss": 1.2103, + "step": 12 + }, + { + "epoch": 0.02, + "learning_rate": 3.58974358974359e-06, + "loss": 1.2289, + "step": 14 + }, + { + "epoch": 0.02, + "learning_rate": 4.102564102564103e-06, + "loss": 1.198, + "step": 16 + }, + { + "epoch": 0.02, + "learning_rate": 4.615384615384616e-06, + "loss": 1.2065, + "step": 18 + }, + { + "epoch": 0.03, + "learning_rate": 5.128205128205128e-06, + "loss": 1.158, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 5.641025641025641e-06, + "loss": 1.1527, + "step": 22 + }, + { + "epoch": 0.03, + "learning_rate": 6.153846153846155e-06, + "loss": 1.1619, + "step": 24 + }, + { + "epoch": 0.03, + "learning_rate": 6.666666666666667e-06, + "loss": 1.1903, + "step": 26 + }, + { + "epoch": 0.04, + "learning_rate": 7.17948717948718e-06, + "loss": 1.1122, + "step": 28 + }, + { + "epoch": 0.04, + "learning_rate": 7.692307692307694e-06, + "loss": 1.1569, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 8.205128205128205e-06, + "loss": 1.1404, + "step": 32 + }, + { + "epoch": 0.04, + "learning_rate": 8.717948717948719e-06, + "loss": 1.136, + "step": 34 + }, + { + "epoch": 0.05, + "learning_rate": 9.230769230769232e-06, + "loss": 1.1614, + "step": 36 + }, + { + "epoch": 0.05, + "learning_rate": 9.743589743589744e-06, + "loss": 1.1315, + "step": 38 + }, + { + "epoch": 0.05, + "learning_rate": 
1.0256410256410256e-05, + "loss": 1.1779, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 1.076923076923077e-05, + "loss": 1.1671, + "step": 42 + }, + { + "epoch": 0.06, + "learning_rate": 1.1282051282051283e-05, + "loss": 1.2101, + "step": 44 + }, + { + "epoch": 0.06, + "learning_rate": 1.1794871794871796e-05, + "loss": 1.144, + "step": 46 + }, + { + "epoch": 0.06, + "learning_rate": 1.230769230769231e-05, + "loss": 1.1268, + "step": 48 + }, + { + "epoch": 0.06, + "learning_rate": 1.2820512820512823e-05, + "loss": 1.1095, + "step": 50 + }, + { + "epoch": 0.07, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.1796, + "step": 52 + }, + { + "epoch": 0.07, + "learning_rate": 1.3846153846153847e-05, + "loss": 1.1147, + "step": 54 + }, + { + "epoch": 0.07, + "learning_rate": 1.435897435897436e-05, + "loss": 1.2055, + "step": 56 + }, + { + "epoch": 0.07, + "learning_rate": 1.4871794871794874e-05, + "loss": 1.1113, + "step": 58 + }, + { + "epoch": 0.08, + "learning_rate": 1.5384615384615387e-05, + "loss": 1.1802, + "step": 60 + }, + { + "epoch": 0.08, + "learning_rate": 1.5897435897435897e-05, + "loss": 1.1897, + "step": 62 + }, + { + "epoch": 0.08, + "learning_rate": 1.641025641025641e-05, + "loss": 1.1682, + "step": 64 + }, + { + "epoch": 0.08, + "learning_rate": 1.6923076923076924e-05, + "loss": 1.1573, + "step": 66 + }, + { + "epoch": 0.09, + "learning_rate": 1.7435897435897438e-05, + "loss": 1.2222, + "step": 68 + }, + { + "epoch": 0.09, + "learning_rate": 1.794871794871795e-05, + "loss": 1.1388, + "step": 70 + }, + { + "epoch": 0.09, + "learning_rate": 1.8461538461538465e-05, + "loss": 1.1363, + "step": 72 + }, + { + "epoch": 0.09, + "learning_rate": 1.8974358974358975e-05, + "loss": 1.2128, + "step": 74 + }, + { + "epoch": 0.1, + "learning_rate": 1.9487179487179488e-05, + "loss": 1.2218, + "step": 76 + }, + { + "epoch": 0.1, + "learning_rate": 2e-05, + "loss": 1.2294, + "step": 78 + }, + { + "epoch": 0.1, + "learning_rate": 1.999991012628722e-05, + 
"loss": 1.1995, + "step": 80 + }, + { + "epoch": 0.11, + "learning_rate": 1.999964050676434e-05, + "loss": 1.186, + "step": 82 + }, + { + "epoch": 0.11, + "learning_rate": 1.999919114627769e-05, + "loss": 1.2237, + "step": 84 + }, + { + "epoch": 0.11, + "learning_rate": 1.999856205290442e-05, + "loss": 1.166, + "step": 86 + }, + { + "epoch": 0.11, + "learning_rate": 1.999775323795232e-05, + "loss": 1.2089, + "step": 88 + }, + { + "epoch": 0.12, + "learning_rate": 1.999676471595962e-05, + "loss": 1.1779, + "step": 90 + }, + { + "epoch": 0.12, + "learning_rate": 1.9995596504694764e-05, + "loss": 1.2088, + "step": 92 + }, + { + "epoch": 0.12, + "learning_rate": 1.999424862515604e-05, + "loss": 1.1989, + "step": 94 + }, + { + "epoch": 0.12, + "learning_rate": 1.9992721101571238e-05, + "loss": 1.225, + "step": 96 + }, + { + "epoch": 0.13, + "learning_rate": 1.99910139613972e-05, + "loss": 1.1995, + "step": 98 + }, + { + "epoch": 0.13, + "learning_rate": 1.998912723531933e-05, + "loss": 1.2278, + "step": 100 + }, + { + "epoch": 0.13, + "learning_rate": 1.9987060957251047e-05, + "loss": 1.2026, + "step": 102 + }, + { + "epoch": 0.13, + "learning_rate": 1.9984815164333163e-05, + "loss": 1.2518, + "step": 104 + }, + { + "epoch": 0.14, + "learning_rate": 1.998238989693323e-05, + "loss": 1.2238, + "step": 106 + }, + { + "epoch": 0.14, + "learning_rate": 1.997978519864481e-05, + "loss": 1.1515, + "step": 108 + }, + { + "epoch": 0.14, + "learning_rate": 1.9977001116286675e-05, + "loss": 1.2342, + "step": 110 + }, + { + "epoch": 0.14, + "learning_rate": 1.9974037699901993e-05, + "loss": 1.1843, + "step": 112 + }, + { + "epoch": 0.15, + "learning_rate": 1.9970895002757413e-05, + "loss": 1.1743, + "step": 114 + }, + { + "epoch": 0.15, + "learning_rate": 1.9967573081342103e-05, + "loss": 1.2454, + "step": 116 + }, + { + "epoch": 0.15, + "learning_rate": 1.9964071995366744e-05, + "loss": 1.168, + "step": 118 + }, + { + "epoch": 0.15, + "learning_rate": 1.9960391807762462e-05, + 
"loss": 1.2358, + "step": 120 + }, + { + "epoch": 0.16, + "learning_rate": 1.9956532584679676e-05, + "loss": 1.209, + "step": 122 + }, + { + "epoch": 0.16, + "learning_rate": 1.995249439548693e-05, + "loss": 1.2039, + "step": 124 + }, + { + "epoch": 0.16, + "learning_rate": 1.994827731276963e-05, + "loss": 1.1564, + "step": 126 + }, + { + "epoch": 0.16, + "learning_rate": 1.994388141232876e-05, + "loss": 1.1712, + "step": 128 + }, + { + "epoch": 0.17, + "learning_rate": 1.9939306773179498e-05, + "loss": 1.2427, + "step": 130 + }, + { + "epoch": 0.17, + "learning_rate": 1.9934553477549795e-05, + "loss": 1.1702, + "step": 132 + }, + { + "epoch": 0.17, + "learning_rate": 1.992962161087893e-05, + "loss": 1.2099, + "step": 134 + }, + { + "epoch": 0.17, + "learning_rate": 1.9924511261815928e-05, + "loss": 1.2159, + "step": 136 + }, + { + "epoch": 0.18, + "learning_rate": 1.9919222522217998e-05, + "loss": 1.19, + "step": 138 + }, + { + "epoch": 0.18, + "learning_rate": 1.9913755487148874e-05, + "loss": 1.2428, + "step": 140 + }, + { + "epoch": 0.18, + "learning_rate": 1.9908110254877107e-05, + "loss": 1.1523, + "step": 142 + }, + { + "epoch": 0.18, + "learning_rate": 1.990228692687429e-05, + "loss": 1.1896, + "step": 144 + }, + { + "epoch": 0.19, + "learning_rate": 1.9896285607813245e-05, + "loss": 1.1611, + "step": 146 + }, + { + "epoch": 0.19, + "learning_rate": 1.989010640556614e-05, + "loss": 1.1762, + "step": 148 + }, + { + "epoch": 0.19, + "learning_rate": 1.988374943120254e-05, + "loss": 1.2099, + "step": 150 + }, + { + "epoch": 0.19, + "learning_rate": 1.9877214798987428e-05, + "loss": 1.2032, + "step": 152 + }, + { + "epoch": 0.2, + "learning_rate": 1.9870502626379127e-05, + "loss": 1.2296, + "step": 154 + }, + { + "epoch": 0.2, + "learning_rate": 1.9863613034027224e-05, + "loss": 1.2021, + "step": 156 + }, + { + "epoch": 0.2, + "learning_rate": 1.985654614577036e-05, + "loss": 1.249, + "step": 158 + }, + { + "epoch": 0.21, + "learning_rate": 
1.9849302088634034e-05, + "loss": 1.2069, + "step": 160 + }, + { + "epoch": 0.21, + "learning_rate": 1.9841880992828306e-05, + "loss": 1.1965, + "step": 162 + }, + { + "epoch": 0.21, + "learning_rate": 1.9834282991745465e-05, + "loss": 1.215, + "step": 164 + }, + { + "epoch": 0.21, + "learning_rate": 1.9826508221957624e-05, + "loss": 1.1737, + "step": 166 + }, + { + "epoch": 0.22, + "learning_rate": 1.981855682321427e-05, + "loss": 1.2171, + "step": 168 + }, + { + "epoch": 0.22, + "learning_rate": 1.981042893843974e-05, + "loss": 1.183, + "step": 170 + }, + { + "epoch": 0.22, + "learning_rate": 1.980212471373068e-05, + "loss": 1.1829, + "step": 172 + }, + { + "epoch": 0.22, + "learning_rate": 1.979364429835339e-05, + "loss": 1.1689, + "step": 174 + }, + { + "epoch": 0.23, + "learning_rate": 1.978498784474115e-05, + "loss": 1.2268, + "step": 176 + }, + { + "epoch": 0.23, + "learning_rate": 1.9776155508491482e-05, + "loss": 1.195, + "step": 178 + }, + { + "epoch": 0.23, + "learning_rate": 1.9767147448363366e-05, + "loss": 1.1841, + "step": 180 + }, + { + "epoch": 0.23, + "learning_rate": 1.9757963826274357e-05, + "loss": 1.1793, + "step": 182 + }, + { + "epoch": 0.24, + "learning_rate": 1.97486048072977e-05, + "loss": 1.1804, + "step": 184 + }, + { + "epoch": 0.24, + "learning_rate": 1.9739070559659347e-05, + "loss": 1.1906, + "step": 186 + }, + { + "epoch": 0.24, + "learning_rate": 1.972936125473495e-05, + "loss": 1.136, + "step": 188 + }, + { + "epoch": 0.24, + "learning_rate": 1.9719477067046768e-05, + "loss": 1.1701, + "step": 190 + }, + { + "epoch": 0.25, + "learning_rate": 1.9709418174260523e-05, + "loss": 1.2474, + "step": 192 + }, + { + "epoch": 0.25, + "learning_rate": 1.9699184757182225e-05, + "loss": 1.1973, + "step": 194 + }, + { + "epoch": 0.25, + "learning_rate": 1.9688776999754913e-05, + "loss": 1.1185, + "step": 196 + }, + { + "epoch": 0.25, + "learning_rate": 1.9678195089055347e-05, + "loss": 1.1669, + "step": 198 + }, + { + "epoch": 0.26, + 
"learning_rate": 1.966743921529065e-05, + "loss": 1.1868, + "step": 200 + }, + { + "epoch": 0.26, + "learning_rate": 1.965650957179488e-05, + "loss": 1.1726, + "step": 202 + }, + { + "epoch": 0.26, + "learning_rate": 1.9645406355025565e-05, + "loss": 1.2315, + "step": 204 + }, + { + "epoch": 0.26, + "learning_rate": 1.963412976456017e-05, + "loss": 1.1774, + "step": 206 + }, + { + "epoch": 0.27, + "learning_rate": 1.9622680003092503e-05, + "loss": 1.1994, + "step": 208 + }, + { + "epoch": 0.27, + "learning_rate": 1.9611057276429085e-05, + "loss": 1.2318, + "step": 210 + }, + { + "epoch": 0.27, + "learning_rate": 1.9599261793485432e-05, + "loss": 1.1817, + "step": 212 + }, + { + "epoch": 0.27, + "learning_rate": 1.958729376628231e-05, + "loss": 1.226, + "step": 214 + }, + { + "epoch": 0.28, + "learning_rate": 1.957515340994193e-05, + "loss": 1.2009, + "step": 216 + }, + { + "epoch": 0.28, + "learning_rate": 1.956284094268407e-05, + "loss": 1.1453, + "step": 218 + }, + { + "epoch": 0.28, + "learning_rate": 1.955035658582216e-05, + "loss": 1.2106, + "step": 220 + }, + { + "epoch": 0.28, + "learning_rate": 1.9537700563759303e-05, + "loss": 1.1813, + "step": 222 + }, + { + "epoch": 0.29, + "learning_rate": 1.9524873103984234e-05, + "loss": 1.1937, + "step": 224 + }, + { + "epoch": 0.29, + "learning_rate": 1.9511874437067243e-05, + "loss": 1.2106, + "step": 226 + }, + { + "epoch": 0.29, + "learning_rate": 1.949870479665602e-05, + "loss": 1.1408, + "step": 228 + }, + { + "epoch": 0.29, + "learning_rate": 1.9485364419471454e-05, + "loss": 1.1855, + "step": 230 + }, + { + "epoch": 0.3, + "learning_rate": 1.9471853545303407e-05, + "loss": 1.213, + "step": 232 + }, + { + "epoch": 0.3, + "learning_rate": 1.9458172417006347e-05, + "loss": 1.2119, + "step": 234 + }, + { + "epoch": 0.3, + "learning_rate": 1.9444321280495045e-05, + "loss": 1.1559, + "step": 236 + }, + { + "epoch": 0.31, + "learning_rate": 1.9430300384740108e-05, + "loss": 1.1915, + "step": 238 + }, + { + "epoch": 
0.31, + "learning_rate": 1.9416109981763526e-05, + "loss": 1.185, + "step": 240 + }, + { + "epoch": 0.31, + "learning_rate": 1.9401750326634144e-05, + "loss": 1.1699, + "step": 242 + }, + { + "epoch": 0.31, + "learning_rate": 1.9387221677463064e-05, + "loss": 1.1679, + "step": 244 + }, + { + "epoch": 0.32, + "learning_rate": 1.9372524295399014e-05, + "loss": 1.2314, + "step": 246 + }, + { + "epoch": 0.32, + "learning_rate": 1.9357658444623655e-05, + "loss": 1.1184, + "step": 248 + }, + { + "epoch": 0.32, + "learning_rate": 1.9342624392346826e-05, + "loss": 1.1837, + "step": 250 + }, + { + "epoch": 0.32, + "learning_rate": 1.9327422408801744e-05, + "loss": 1.206, + "step": 252 + }, + { + "epoch": 0.33, + "learning_rate": 1.9312052767240153e-05, + "loss": 1.1639, + "step": 254 + }, + { + "epoch": 0.33, + "learning_rate": 1.92965157439274e-05, + "loss": 1.2148, + "step": 256 + }, + { + "epoch": 0.33, + "learning_rate": 1.9280811618137486e-05, + "loss": 1.1959, + "step": 258 + }, + { + "epoch": 0.33, + "learning_rate": 1.9264940672148018e-05, + "loss": 1.2305, + "step": 260 + }, + { + "epoch": 0.34, + "learning_rate": 1.9248903191235177e-05, + "loss": 1.1884, + "step": 262 + }, + { + "epoch": 0.34, + "learning_rate": 1.9232699463668543e-05, + "loss": 1.1777, + "step": 264 + }, + { + "epoch": 0.34, + "learning_rate": 1.9216329780705955e-05, + "loss": 1.202, + "step": 266 + }, + { + "epoch": 0.34, + "learning_rate": 1.9199794436588244e-05, + "loss": 1.1599, + "step": 268 + }, + { + "epoch": 0.35, + "learning_rate": 1.9183093728533966e-05, + "loss": 1.1797, + "step": 270 + }, + { + "epoch": 0.35, + "learning_rate": 1.916622795673405e-05, + "loss": 1.1717, + "step": 272 + }, + { + "epoch": 0.35, + "learning_rate": 1.9149197424346405e-05, + "loss": 1.1554, + "step": 274 + }, + { + "epoch": 0.35, + "learning_rate": 1.913200243749046e-05, + "loss": 1.1803, + "step": 276 + }, + { + "epoch": 0.36, + "learning_rate": 1.9114643305241678e-05, + "loss": 1.1519, + "step": 278 + }, + 
{ + "epoch": 0.36, + "learning_rate": 1.9097120339625994e-05, + "loss": 1.191, + "step": 280 + }, + { + "epoch": 0.36, + "learning_rate": 1.9079433855614203e-05, + "loss": 1.1641, + "step": 282 + }, + { + "epoch": 0.36, + "learning_rate": 1.9061584171116302e-05, + "loss": 1.1831, + "step": 284 + }, + { + "epoch": 0.37, + "learning_rate": 1.9043571606975776e-05, + "loss": 1.194, + "step": 286 + }, + { + "epoch": 0.37, + "learning_rate": 1.9025396486963827e-05, + "loss": 1.1389, + "step": 288 + }, + { + "epoch": 0.37, + "learning_rate": 1.900705913777356e-05, + "loss": 1.1828, + "step": 290 + }, + { + "epoch": 0.37, + "learning_rate": 1.89885598890141e-05, + "loss": 1.1803, + "step": 292 + }, + { + "epoch": 0.38, + "learning_rate": 1.8969899073204687e-05, + "loss": 1.1592, + "step": 294 + }, + { + "epoch": 0.38, + "learning_rate": 1.895107702576868e-05, + "loss": 1.229, + "step": 296 + }, + { + "epoch": 0.38, + "learning_rate": 1.8932094085027534e-05, + "loss": 1.2132, + "step": 298 + }, + { + "epoch": 0.38, + "learning_rate": 1.891295059219472e-05, + "loss": 1.153, + "step": 300 + }, + { + "epoch": 0.39, + "learning_rate": 1.88936468913696e-05, + "loss": 1.1967, + "step": 302 + }, + { + "epoch": 0.39, + "learning_rate": 1.8874183329531222e-05, + "loss": 1.1831, + "step": 304 + }, + { + "epoch": 0.39, + "learning_rate": 1.8854560256532098e-05, + "loss": 1.1763, + "step": 306 + }, + { + "epoch": 0.39, + "learning_rate": 1.883477802509192e-05, + "loss": 1.1754, + "step": 308 + }, + { + "epoch": 0.4, + "learning_rate": 1.88148369907912e-05, + "loss": 1.116, + "step": 310 + }, + { + "epoch": 0.4, + "learning_rate": 1.879473751206489e-05, + "loss": 1.1712, + "step": 312 + }, + { + "epoch": 0.4, + "learning_rate": 1.877447995019596e-05, + "loss": 1.142, + "step": 314 + }, + { + "epoch": 0.41, + "learning_rate": 1.875406466930886e-05, + "loss": 1.1779, + "step": 316 + }, + { + "epoch": 0.41, + "learning_rate": 1.8733492036363007e-05, + "loss": 1.1667, + "step": 318 + }, + { 
+ "epoch": 0.41, + "learning_rate": 1.8712762421146185e-05, + "loss": 1.1793, + "step": 320 + }, + { + "epoch": 0.41, + "learning_rate": 1.8691876196267892e-05, + "loss": 1.1881, + "step": 322 + }, + { + "epoch": 0.42, + "learning_rate": 1.867083373715264e-05, + "loss": 1.1687, + "step": 324 + }, + { + "epoch": 0.42, + "learning_rate": 1.8649635422033218e-05, + "loss": 1.2383, + "step": 326 + }, + { + "epoch": 0.42, + "learning_rate": 1.862828163194388e-05, + "loss": 1.1476, + "step": 328 + }, + { + "epoch": 0.42, + "learning_rate": 1.8606772750713503e-05, + "loss": 1.228, + "step": 330 + }, + { + "epoch": 0.43, + "learning_rate": 1.8585109164958698e-05, + "loss": 1.1453, + "step": 332 + }, + { + "epoch": 0.43, + "learning_rate": 1.8563291264076834e-05, + "loss": 1.1834, + "step": 334 + }, + { + "epoch": 0.43, + "learning_rate": 1.8541319440239066e-05, + "loss": 1.148, + "step": 336 + }, + { + "epoch": 0.43, + "learning_rate": 1.851919408838327e-05, + "loss": 1.1956, + "step": 338 + }, + { + "epoch": 0.44, + "learning_rate": 1.8496915606206952e-05, + "loss": 1.152, + "step": 340 + }, + { + "epoch": 0.44, + "learning_rate": 1.847448439416009e-05, + "loss": 1.1796, + "step": 342 + }, + { + "epoch": 0.44, + "learning_rate": 1.845190085543795e-05, + "loss": 1.1107, + "step": 344 + }, + { + "epoch": 0.44, + "learning_rate": 1.842916539597382e-05, + "loss": 1.1534, + "step": 346 + }, + { + "epoch": 0.45, + "learning_rate": 1.8406278424431737e-05, + "loss": 1.118, + "step": 348 + }, + { + "epoch": 0.45, + "learning_rate": 1.8383240352199118e-05, + "loss": 1.1504, + "step": 350 + }, + { + "epoch": 0.45, + "learning_rate": 1.8360051593379383e-05, + "loss": 1.1902, + "step": 352 + }, + { + "epoch": 0.45, + "learning_rate": 1.8336712564784506e-05, + "loss": 1.2, + "step": 354 + }, + { + "epoch": 0.46, + "learning_rate": 1.8313223685927507e-05, + "loss": 1.1258, + "step": 356 + }, + { + "epoch": 0.46, + "learning_rate": 1.8289585379014942e-05, + "loss": 1.2068, + "step": 358 + 
}, + { + "epoch": 0.46, + "learning_rate": 1.8265798068939295e-05, + "loss": 1.2186, + "step": 360 + }, + { + "epoch": 0.46, + "learning_rate": 1.8241862183271338e-05, + "loss": 1.1507, + "step": 362 + }, + { + "epoch": 0.47, + "learning_rate": 1.821777815225245e-05, + "loss": 1.2061, + "step": 364 + }, + { + "epoch": 0.47, + "learning_rate": 1.81935464087869e-05, + "loss": 1.1496, + "step": 366 + }, + { + "epoch": 0.47, + "learning_rate": 1.8169167388434024e-05, + "loss": 1.2071, + "step": 368 + }, + { + "epoch": 0.47, + "learning_rate": 1.8144641529400445e-05, + "loss": 1.128, + "step": 370 + }, + { + "epoch": 0.48, + "learning_rate": 1.8119969272532164e-05, + "loss": 1.1827, + "step": 372 + }, + { + "epoch": 0.48, + "learning_rate": 1.8095151061306647e-05, + "loss": 1.1704, + "step": 374 + }, + { + "epoch": 0.48, + "learning_rate": 1.8070187341824848e-05, + "loss": 1.1599, + "step": 376 + }, + { + "epoch": 0.48, + "learning_rate": 1.8045078562803203e-05, + "loss": 1.1552, + "step": 378 + }, + { + "epoch": 0.49, + "learning_rate": 1.8019825175565544e-05, + "loss": 1.2022, + "step": 380 + }, + { + "epoch": 0.49, + "learning_rate": 1.7994427634035016e-05, + "loss": 1.1764, + "step": 382 + }, + { + "epoch": 0.49, + "learning_rate": 1.7968886394725876e-05, + "loss": 1.1236, + "step": 384 + }, + { + "epoch": 0.49, + "learning_rate": 1.7943201916735337e-05, + "loss": 1.1416, + "step": 386 + }, + { + "epoch": 0.5, + "learning_rate": 1.791737466173527e-05, + "loss": 1.1957, + "step": 388 + }, + { + "epoch": 0.5, + "learning_rate": 1.789140509396394e-05, + "loss": 1.1644, + "step": 390 + }, + { + "epoch": 0.5, + "learning_rate": 1.7865293680217636e-05, + "loss": 1.1515, + "step": 392 + }, + { + "epoch": 0.51, + "learning_rate": 1.7839040889842307e-05, + "loss": 1.1695, + "step": 394 + }, + { + "epoch": 0.51, + "learning_rate": 1.7812647194725093e-05, + "loss": 1.1582, + "step": 396 + }, + { + "epoch": 0.51, + "learning_rate": 1.7786113069285877e-05, + "loss": 1.1106, + 
"step": 398 + }, + { + "epoch": 0.51, + "learning_rate": 1.7759438990468726e-05, + "loss": 1.1334, + "step": 400 + }, + { + "epoch": 0.52, + "learning_rate": 1.7732625437733338e-05, + "loss": 1.1705, + "step": 402 + }, + { + "epoch": 0.52, + "learning_rate": 1.7705672893046425e-05, + "loss": 1.1593, + "step": 404 + }, + { + "epoch": 0.52, + "learning_rate": 1.767858184087304e-05, + "loss": 1.1647, + "step": 406 + }, + { + "epoch": 0.52, + "learning_rate": 1.765135276816787e-05, + "loss": 1.1588, + "step": 408 + }, + { + "epoch": 0.53, + "learning_rate": 1.7623986164366487e-05, + "loss": 1.1233, + "step": 410 + }, + { + "epoch": 0.53, + "learning_rate": 1.7596482521376546e-05, + "loss": 1.2083, + "step": 412 + }, + { + "epoch": 0.53, + "learning_rate": 1.7568842333568952e-05, + "loss": 1.1786, + "step": 414 + }, + { + "epoch": 0.53, + "learning_rate": 1.7541066097768965e-05, + "loss": 1.2201, + "step": 416 + }, + { + "epoch": 0.54, + "learning_rate": 1.7513154313247273e-05, + "loss": 1.1545, + "step": 418 + }, + { + "epoch": 0.54, + "learning_rate": 1.7485107481711014e-05, + "loss": 1.1718, + "step": 420 + }, + { + "epoch": 0.54, + "learning_rate": 1.7456926107294765e-05, + "loss": 1.144, + "step": 422 + }, + { + "epoch": 0.54, + "learning_rate": 1.742861069655148e-05, + "loss": 1.1385, + "step": 424 + }, + { + "epoch": 0.55, + "learning_rate": 1.7400161758443377e-05, + "loss": 1.2025, + "step": 426 + }, + { + "epoch": 0.55, + "learning_rate": 1.737157980433279e-05, + "loss": 1.2103, + "step": 428 + }, + { + "epoch": 0.55, + "learning_rate": 1.7342865347972987e-05, + "loss": 1.1041, + "step": 430 + }, + { + "epoch": 0.55, + "learning_rate": 1.7314018905498932e-05, + "loss": 1.2066, + "step": 432 + }, + { + "epoch": 0.56, + "learning_rate": 1.7285040995418003e-05, + "loss": 1.1626, + "step": 434 + }, + { + "epoch": 0.56, + "learning_rate": 1.7255932138600665e-05, + "loss": 1.1846, + "step": 436 + }, + { + "epoch": 0.56, + "learning_rate": 1.7226692858271133e-05, + 
"loss": 1.1982, + "step": 438 + }, + { + "epoch": 0.56, + "learning_rate": 1.7197323679997943e-05, + "loss": 1.192, + "step": 440 + }, + { + "epoch": 0.57, + "learning_rate": 1.7167825131684516e-05, + "loss": 1.1833, + "step": 442 + }, + { + "epoch": 0.57, + "learning_rate": 1.7138197743559656e-05, + "loss": 1.1525, + "step": 444 + }, + { + "epoch": 0.57, + "learning_rate": 1.7108442048168038e-05, + "loss": 1.18, + "step": 446 + }, + { + "epoch": 0.57, + "learning_rate": 1.707855858036063e-05, + "loss": 1.1821, + "step": 448 + }, + { + "epoch": 0.58, + "learning_rate": 1.7048547877285078e-05, + "loss": 1.1484, + "step": 450 + }, + { + "epoch": 0.58, + "learning_rate": 1.7018410478376033e-05, + "loss": 1.1684, + "step": 452 + }, + { + "epoch": 0.58, + "learning_rate": 1.6988146925345487e-05, + "loss": 1.1598, + "step": 454 + }, + { + "epoch": 0.58, + "learning_rate": 1.695775776217301e-05, + "loss": 1.2171, + "step": 456 + }, + { + "epoch": 0.59, + "learning_rate": 1.6927243535095995e-05, + "loss": 1.1498, + "step": 458 + }, + { + "epoch": 0.59, + "learning_rate": 1.6896604792599813e-05, + "loss": 1.1854, + "step": 460 + }, + { + "epoch": 0.59, + "learning_rate": 1.686584208540797e-05, + "loss": 1.1838, + "step": 462 + }, + { + "epoch": 0.59, + "learning_rate": 1.6834955966472214e-05, + "loss": 1.1048, + "step": 464 + }, + { + "epoch": 0.6, + "learning_rate": 1.6803946990962577e-05, + "loss": 1.1611, + "step": 466 + }, + { + "epoch": 0.6, + "learning_rate": 1.6772815716257414e-05, + "loss": 1.1762, + "step": 468 + }, + { + "epoch": 0.6, + "learning_rate": 1.6741562701933366e-05, + "loss": 1.1379, + "step": 470 + }, + { + "epoch": 0.61, + "learning_rate": 1.671018850975533e-05, + "loss": 1.1169, + "step": 472 + }, + { + "epoch": 0.61, + "learning_rate": 1.6678693703666327e-05, + "loss": 1.1535, + "step": 474 + }, + { + "epoch": 0.61, + "learning_rate": 1.664707884977739e-05, + "loss": 1.1519, + "step": 476 + }, + { + "epoch": 0.61, + "learning_rate": 
1.661534451635738e-05, + "loss": 1.1546, + "step": 478 + }, + { + "epoch": 0.62, + "learning_rate": 1.6583491273822763e-05, + "loss": 1.1519, + "step": 480 + }, + { + "epoch": 0.62, + "learning_rate": 1.655151969472738e-05, + "loss": 1.1728, + "step": 482 + }, + { + "epoch": 0.62, + "learning_rate": 1.6519430353752138e-05, + "loss": 1.1857, + "step": 484 + }, + { + "epoch": 0.62, + "learning_rate": 1.6487223827694673e-05, + "loss": 1.1992, + "step": 486 + }, + { + "epoch": 0.63, + "learning_rate": 1.6454900695459e-05, + "loss": 1.2115, + "step": 488 + }, + { + "epoch": 0.63, + "learning_rate": 1.6422461538045104e-05, + "loss": 1.1467, + "step": 490 + }, + { + "epoch": 0.63, + "learning_rate": 1.638990693853848e-05, + "loss": 1.1135, + "step": 492 + }, + { + "epoch": 0.63, + "learning_rate": 1.6357237482099682e-05, + "loss": 1.183, + "step": 494 + }, + { + "epoch": 0.64, + "learning_rate": 1.6324453755953772e-05, + "loss": 1.1526, + "step": 496 + }, + { + "epoch": 0.64, + "learning_rate": 1.6291556349379794e-05, + "loss": 1.1303, + "step": 498 + }, + { + "epoch": 0.64, + "learning_rate": 1.6258545853700157e-05, + "loss": 1.1992, + "step": 500 + }, + { + "epoch": 0.64, + "learning_rate": 1.622542286227003e-05, + "loss": 1.2016, + "step": 502 + }, + { + "epoch": 0.65, + "learning_rate": 1.6192187970466646e-05, + "loss": 1.1177, + "step": 504 + }, + { + "epoch": 0.65, + "learning_rate": 1.615884177567863e-05, + "loss": 1.1759, + "step": 506 + }, + { + "epoch": 0.65, + "learning_rate": 1.6125384877295255e-05, + "loss": 1.1465, + "step": 508 + }, + { + "epoch": 0.65, + "learning_rate": 1.6091817876695655e-05, + "loss": 1.1785, + "step": 510 + }, + { + "epoch": 0.66, + "learning_rate": 1.6058141377238026e-05, + "loss": 1.1657, + "step": 512 + }, + { + "epoch": 0.66, + "learning_rate": 1.602435598424877e-05, + "loss": 1.1519, + "step": 514 + }, + { + "epoch": 0.66, + "learning_rate": 1.599046230501163e-05, + "loss": 1.2162, + "step": 516 + }, + { + "epoch": 0.66, + 
"learning_rate": 1.5956460948756765e-05, + "loss": 1.138, + "step": 518 + }, + { + "epoch": 0.67, + "learning_rate": 1.5922352526649803e-05, + "loss": 1.1427, + "step": 520 + }, + { + "epoch": 0.67, + "learning_rate": 1.5888137651780847e-05, + "loss": 1.1866, + "step": 522 + }, + { + "epoch": 0.67, + "learning_rate": 1.585381693915346e-05, + "loss": 1.1558, + "step": 524 + }, + { + "epoch": 0.67, + "learning_rate": 1.581939100567363e-05, + "loss": 1.2041, + "step": 526 + }, + { + "epoch": 0.68, + "learning_rate": 1.5784860470138633e-05, + "loss": 1.1783, + "step": 528 + }, + { + "epoch": 0.68, + "learning_rate": 1.5750225953225968e-05, + "loss": 1.1589, + "step": 530 + }, + { + "epoch": 0.68, + "learning_rate": 1.5715488077482152e-05, + "loss": 1.1821, + "step": 532 + }, + { + "epoch": 0.68, + "learning_rate": 1.568064746731156e-05, + "loss": 1.1705, + "step": 534 + }, + { + "epoch": 0.69, + "learning_rate": 1.5645704748965193e-05, + "loss": 1.1844, + "step": 536 + }, + { + "epoch": 0.69, + "learning_rate": 1.5610660550529413e-05, + "loss": 1.15, + "step": 538 + }, + { + "epoch": 0.69, + "learning_rate": 1.557551550191467e-05, + "loss": 1.1499, + "step": 540 + }, + { + "epoch": 0.69, + "learning_rate": 1.554027023484416e-05, + "loss": 1.1426, + "step": 542 + }, + { + "epoch": 0.7, + "learning_rate": 1.550492538284249e-05, + "loss": 1.1351, + "step": 544 + }, + { + "epoch": 0.7, + "learning_rate": 1.5469481581224274e-05, + "loss": 1.1603, + "step": 546 + }, + { + "epoch": 0.7, + "learning_rate": 1.5433939467082713e-05, + "loss": 1.108, + "step": 548 + }, + { + "epoch": 0.71, + "learning_rate": 1.5398299679278172e-05, + "loss": 1.1839, + "step": 550 + }, + { + "epoch": 0.71, + "learning_rate": 1.5362562858426655e-05, + "loss": 1.2105, + "step": 552 + }, + { + "epoch": 0.71, + "learning_rate": 1.5326729646888314e-05, + "loss": 1.1467, + "step": 554 + }, + { + "epoch": 0.71, + "learning_rate": 1.5290800688755906e-05, + "loss": 1.1844, + "step": 556 + }, + { + "epoch": 
0.72, + "learning_rate": 1.5254776629843204e-05, + "loss": 1.1516, + "step": 558 + }, + { + "epoch": 0.72, + "learning_rate": 1.5218658117673389e-05, + "loss": 1.145, + "step": 560 + }, + { + "epoch": 0.72, + "learning_rate": 1.518244580146742e-05, + "loss": 1.1279, + "step": 562 + }, + { + "epoch": 0.72, + "learning_rate": 1.5146140332132359e-05, + "loss": 1.11, + "step": 564 + }, + { + "epoch": 0.73, + "learning_rate": 1.5109742362249673e-05, + "loss": 1.1563, + "step": 566 + }, + { + "epoch": 0.73, + "learning_rate": 1.5073252546063493e-05, + "loss": 1.1349, + "step": 568 + }, + { + "epoch": 0.73, + "learning_rate": 1.5036671539468879e-05, + "loss": 1.177, + "step": 570 + }, + { + "epoch": 0.73, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.1504, + "step": 572 + }, + { + "epoch": 0.74, + "learning_rate": 1.4963238586818346e-05, + "loss": 1.1015, + "step": 574 + }, + { + "epoch": 0.74, + "learning_rate": 1.4926387960700843e-05, + "loss": 1.1251, + "step": 576 + }, + { + "epoch": 0.74, + "learning_rate": 1.488944878402802e-05, + "loss": 1.0975, + "step": 578 + }, + { + "epoch": 0.74, + "learning_rate": 1.4852421720772064e-05, + "loss": 1.1238, + "step": 580 + }, + { + "epoch": 0.75, + "learning_rate": 1.4815307436484898e-05, + "loss": 1.1614, + "step": 582 + }, + { + "epoch": 0.75, + "learning_rate": 1.4778106598286235e-05, + "loss": 1.0992, + "step": 584 + }, + { + "epoch": 0.75, + "learning_rate": 1.4740819874851562e-05, + "loss": 1.1603, + "step": 586 + }, + { + "epoch": 0.75, + "learning_rate": 1.4703447936400135e-05, + "loss": 1.1239, + "step": 588 + }, + { + "epoch": 0.76, + "learning_rate": 1.4665991454682924e-05, + "loss": 1.1235, + "step": 590 + }, + { + "epoch": 0.76, + "learning_rate": 1.4628451102970546e-05, + "loss": 1.1074, + "step": 592 + }, + { + "epoch": 0.76, + "learning_rate": 1.4590827556041158e-05, + "loss": 1.125, + "step": 594 + }, + { + "epoch": 0.76, + "learning_rate": 1.4553121490168335e-05, + "loss": 1.176, + "step": 596 + }, + 
{ + "epoch": 0.77, + "learning_rate": 1.4515333583108896e-05, + "loss": 1.1502, + "step": 598 + }, + { + "epoch": 0.77, + "learning_rate": 1.4477464514090745e-05, + "loss": 1.1628, + "step": 600 + }, + { + "epoch": 0.77, + "learning_rate": 1.443951496380065e-05, + "loss": 1.1416, + "step": 602 + }, + { + "epoch": 0.77, + "learning_rate": 1.4401485614372009e-05, + "loss": 1.123, + "step": 604 + }, + { + "epoch": 0.78, + "learning_rate": 1.4363377149372584e-05, + "loss": 1.1377, + "step": 606 + }, + { + "epoch": 0.78, + "learning_rate": 1.4325190253792222e-05, + "loss": 1.1249, + "step": 608 + }, + { + "epoch": 0.78, + "learning_rate": 1.4286925614030542e-05, + "loss": 1.153, + "step": 610 + }, + { + "epoch": 0.78, + "learning_rate": 1.4248583917884595e-05, + "loss": 1.1371, + "step": 612 + }, + { + "epoch": 0.79, + "learning_rate": 1.4210165854536495e-05, + "loss": 1.1357, + "step": 614 + }, + { + "epoch": 0.79, + "learning_rate": 1.4171672114541042e-05, + "loss": 1.1411, + "step": 616 + }, + { + "epoch": 0.79, + "learning_rate": 1.4133103389813302e-05, + "loss": 1.1326, + "step": 618 + }, + { + "epoch": 0.79, + "learning_rate": 1.409446037361617e-05, + "loss": 1.1138, + "step": 620 + }, + { + "epoch": 0.8, + "learning_rate": 1.4055743760547918e-05, + "loss": 1.1355, + "step": 622 + }, + { + "epoch": 0.8, + "learning_rate": 1.4016954246529697e-05, + "loss": 1.1834, + "step": 624 + }, + { + "epoch": 0.8, + "learning_rate": 1.3978092528793032e-05, + "loss": 1.1505, + "step": 626 + }, + { + "epoch": 0.81, + "learning_rate": 1.39391593058673e-05, + "loss": 1.1546, + "step": 628 + }, + { + "epoch": 0.81, + "learning_rate": 1.3900155277567157e-05, + "loss": 1.1374, + "step": 630 + }, + { + "epoch": 0.81, + "learning_rate": 1.3861081144979975e-05, + "loss": 1.1181, + "step": 632 + }, + { + "epoch": 0.81, + "learning_rate": 1.382193761045322e-05, + "loss": 1.0933, + "step": 634 + }, + { + "epoch": 0.82, + "learning_rate": 1.378272537758185e-05, + "loss": 1.1265, + "step": 
636 + }, + { + "epoch": 0.82, + "learning_rate": 1.3743445151195658e-05, + "loss": 1.1638, + "step": 638 + }, + { + "epoch": 0.82, + "learning_rate": 1.37040976373466e-05, + "loss": 1.1848, + "step": 640 + }, + { + "epoch": 0.82, + "learning_rate": 1.3664683543296114e-05, + "loss": 1.1373, + "step": 642 + }, + { + "epoch": 0.83, + "learning_rate": 1.3625203577502384e-05, + "loss": 1.1536, + "step": 644 + }, + { + "epoch": 0.83, + "learning_rate": 1.3585658449607632e-05, + "loss": 1.1245, + "step": 646 + }, + { + "epoch": 0.83, + "learning_rate": 1.3546048870425356e-05, + "loss": 1.0689, + "step": 648 + }, + { + "epoch": 0.83, + "learning_rate": 1.3506375551927546e-05, + "loss": 1.1213, + "step": 650 + }, + { + "epoch": 0.84, + "learning_rate": 1.3466639207231882e-05, + "loss": 1.1406, + "step": 652 + }, + { + "epoch": 0.84, + "learning_rate": 1.3426840550588933e-05, + "loss": 1.1099, + "step": 654 + }, + { + "epoch": 0.84, + "learning_rate": 1.3386980297369308e-05, + "loss": 1.1177, + "step": 656 + }, + { + "epoch": 0.84, + "learning_rate": 1.3347059164050796e-05, + "loss": 1.1322, + "step": 658 + }, + { + "epoch": 0.85, + "learning_rate": 1.3307077868205487e-05, + "loss": 1.1567, + "step": 660 + }, + { + "epoch": 0.85, + "learning_rate": 1.3267037128486883e-05, + "loss": 1.1643, + "step": 662 + }, + { + "epoch": 0.85, + "learning_rate": 1.3226937664616977e-05, + "loss": 1.149, + "step": 664 + }, + { + "epoch": 0.85, + "learning_rate": 1.3186780197373306e-05, + "loss": 1.1518, + "step": 666 + }, + { + "epoch": 0.86, + "learning_rate": 1.3146565448576002e-05, + "loss": 1.1233, + "step": 668 + }, + { + "epoch": 0.86, + "learning_rate": 1.3106294141074825e-05, + "loss": 1.1056, + "step": 670 + }, + { + "epoch": 0.86, + "learning_rate": 1.3065966998736155e-05, + "loss": 1.1346, + "step": 672 + }, + { + "epoch": 0.86, + "learning_rate": 1.302558474643e-05, + "loss": 1.1276, + "step": 674 + }, + { + "epoch": 0.87, + "learning_rate": 1.2985148110016947e-05, + "loss": 
1.0885, + "step": 676 + }, + { + "epoch": 0.87, + "learning_rate": 1.2944657816335124e-05, + "loss": 1.1238, + "step": 678 + }, + { + "epoch": 0.87, + "learning_rate": 1.2904114593187136e-05, + "loss": 1.1562, + "step": 680 + }, + { + "epoch": 0.87, + "learning_rate": 1.2863519169326984e-05, + "loss": 1.1464, + "step": 682 + }, + { + "epoch": 0.88, + "learning_rate": 1.2822872274446958e-05, + "loss": 1.1608, + "step": 684 + }, + { + "epoch": 0.88, + "learning_rate": 1.2782174639164528e-05, + "loss": 1.1692, + "step": 686 + }, + { + "epoch": 0.88, + "learning_rate": 1.2741426995009214e-05, + "loss": 1.1277, + "step": 688 + }, + { + "epoch": 0.88, + "learning_rate": 1.2700630074409427e-05, + "loss": 1.0813, + "step": 690 + }, + { + "epoch": 0.89, + "learning_rate": 1.2659784610679318e-05, + "loss": 1.1582, + "step": 692 + }, + { + "epoch": 0.89, + "learning_rate": 1.2618891338005574e-05, + "loss": 1.0978, + "step": 694 + }, + { + "epoch": 0.89, + "learning_rate": 1.2577950991434249e-05, + "loss": 1.1666, + "step": 696 + }, + { + "epoch": 0.89, + "learning_rate": 1.2536964306857526e-05, + "loss": 1.0977, + "step": 698 + }, + { + "epoch": 0.9, + "learning_rate": 1.2495932021000516e-05, + "loss": 1.1235, + "step": 700 + }, + { + "epoch": 0.9, + "learning_rate": 1.2454854871407993e-05, + "loss": 1.0975, + "step": 702 + }, + { + "epoch": 0.9, + "learning_rate": 1.2413733596431141e-05, + "loss": 1.151, + "step": 704 + }, + { + "epoch": 0.91, + "learning_rate": 1.2372568935214298e-05, + "loss": 1.1025, + "step": 706 + }, + { + "epoch": 0.91, + "learning_rate": 1.2331361627681645e-05, + "loss": 1.129, + "step": 708 + }, + { + "epoch": 0.91, + "learning_rate": 1.2290112414523927e-05, + "loss": 1.0876, + "step": 710 + }, + { + "epoch": 0.91, + "learning_rate": 1.2248822037185137e-05, + "loss": 1.133, + "step": 712 + }, + { + "epoch": 0.92, + "learning_rate": 1.2207491237849174e-05, + "loss": 1.0566, + "step": 714 + }, + { + "epoch": 0.92, + "learning_rate": 
1.2166120759426515e-05, + "loss": 1.1285, + "step": 716 + }, + { + "epoch": 0.92, + "learning_rate": 1.2124711345540861e-05, + "loss": 1.1185, + "step": 718 + }, + { + "epoch": 0.92, + "learning_rate": 1.2083263740515764e-05, + "loss": 1.1878, + "step": 720 + }, + { + "epoch": 0.93, + "learning_rate": 1.2041778689361254e-05, + "loss": 1.1579, + "step": 722 + }, + { + "epoch": 0.93, + "learning_rate": 1.2000256937760446e-05, + "loss": 1.0989, + "step": 724 + }, + { + "epoch": 0.93, + "learning_rate": 1.1958699232056135e-05, + "loss": 1.1199, + "step": 726 + }, + { + "epoch": 0.93, + "learning_rate": 1.1917106319237386e-05, + "loss": 1.1138, + "step": 728 + }, + { + "epoch": 0.94, + "learning_rate": 1.1875478946926094e-05, + "loss": 1.0807, + "step": 730 + }, + { + "epoch": 0.94, + "learning_rate": 1.1833817863363563e-05, + "loss": 1.1003, + "step": 732 + }, + { + "epoch": 0.94, + "learning_rate": 1.1792123817397041e-05, + "loss": 1.1413, + "step": 734 + }, + { + "epoch": 0.94, + "learning_rate": 1.1750397558466273e-05, + "loss": 1.1445, + "step": 736 + }, + { + "epoch": 0.95, + "learning_rate": 1.1708639836590024e-05, + "loss": 1.1045, + "step": 738 + }, + { + "epoch": 0.95, + "learning_rate": 1.1666851402352587e-05, + "loss": 1.1179, + "step": 740 + }, + { + "epoch": 0.95, + "learning_rate": 1.1625033006890316e-05, + "loss": 1.1155, + "step": 742 + }, + { + "epoch": 0.95, + "learning_rate": 1.15831854018781e-05, + "loss": 1.1536, + "step": 744 + }, + { + "epoch": 0.96, + "learning_rate": 1.154130933951587e-05, + "loss": 1.1098, + "step": 746 + }, + { + "epoch": 0.96, + "learning_rate": 1.1499405572515059e-05, + "loss": 1.1107, + "step": 748 + }, + { + "epoch": 0.96, + "learning_rate": 1.1457474854085095e-05, + "loss": 1.1047, + "step": 750 + }, + { + "epoch": 0.96, + "learning_rate": 1.1415517937919846e-05, + "loss": 1.103, + "step": 752 + }, + { + "epoch": 0.97, + "learning_rate": 1.1373535578184083e-05, + "loss": 1.0788, + "step": 754 + }, + { + "epoch": 0.97, + 
"learning_rate": 1.1331528529499909e-05, + "loss": 1.1725, + "step": 756 + }, + { + "epoch": 0.97, + "learning_rate": 1.1289497546933212e-05, + "loss": 1.1143, + "step": 758 + }, + { + "epoch": 0.97, + "learning_rate": 1.124744338598008e-05, + "loss": 1.108, + "step": 760 + }, + { + "epoch": 0.98, + "learning_rate": 1.1205366802553231e-05, + "loss": 1.1535, + "step": 762 + }, + { + "epoch": 0.98, + "learning_rate": 1.1163268552968422e-05, + "loss": 1.1294, + "step": 764 + }, + { + "epoch": 0.98, + "learning_rate": 1.112114939393085e-05, + "loss": 1.1086, + "step": 766 + }, + { + "epoch": 0.98, + "learning_rate": 1.1079010082521557e-05, + "loss": 1.0848, + "step": 768 + }, + { + "epoch": 0.99, + "learning_rate": 1.1036851376183812e-05, + "loss": 1.0813, + "step": 770 + }, + { + "epoch": 0.99, + "learning_rate": 1.0994674032709514e-05, + "loss": 1.0692, + "step": 772 + }, + { + "epoch": 0.99, + "learning_rate": 1.095247881022555e-05, + "loss": 1.09, + "step": 774 + }, + { + "epoch": 0.99, + "learning_rate": 1.091026646718018e-05, + "loss": 1.0911, + "step": 776 + }, + { + "epoch": 1.0, + "learning_rate": 1.0868037762329405e-05, + "loss": 1.1097, + "step": 778 + }, + { + "epoch": 1.0, + "learning_rate": 1.0825793454723325e-05, + "loss": 1.0897, + "step": 780 + }, + { + "epoch": 1.0, + "learning_rate": 1.0783534303692493e-05, + "loss": 0.7686, + "step": 782 + }, + { + "epoch": 1.01, + "learning_rate": 1.0741261068834266e-05, + "loss": 0.7018, + "step": 784 + }, + { + "epoch": 1.01, + "learning_rate": 1.0698974509999159e-05, + "loss": 0.7157, + "step": 786 + }, + { + "epoch": 1.01, + "learning_rate": 1.0656675387277183e-05, + "loss": 0.7156, + "step": 788 + }, + { + "epoch": 1.01, + "learning_rate": 1.0614364460984178e-05, + "loss": 0.7095, + "step": 790 + }, + { + "epoch": 1.02, + "learning_rate": 1.057204249164815e-05, + "loss": 0.7385, + "step": 792 + }, + { + "epoch": 1.02, + "learning_rate": 1.0529710239995606e-05, + "loss": 0.7379, + "step": 794 + }, + { + 
"epoch": 1.02, + "learning_rate": 1.0487368466937866e-05, + "loss": 0.696, + "step": 796 + }, + { + "epoch": 1.02, + "learning_rate": 1.0445017933557404e-05, + "loss": 0.7305, + "step": 798 + }, + { + "epoch": 1.03, + "learning_rate": 1.0402659401094154e-05, + "loss": 0.7143, + "step": 800 + }, + { + "epoch": 1.03, + "learning_rate": 1.036029363093183e-05, + "loss": 0.7417, + "step": 802 + }, + { + "epoch": 1.03, + "learning_rate": 1.0317921384584245e-05, + "loss": 0.7243, + "step": 804 + }, + { + "epoch": 1.03, + "learning_rate": 1.0275543423681622e-05, + "loss": 0.7122, + "step": 806 + }, + { + "epoch": 1.04, + "learning_rate": 1.0233160509956893e-05, + "loss": 0.697, + "step": 808 + }, + { + "epoch": 1.04, + "learning_rate": 1.0190773405232024e-05, + "loss": 0.6923, + "step": 810 + }, + { + "epoch": 1.04, + "learning_rate": 1.014838287140431e-05, + "loss": 0.6897, + "step": 812 + }, + { + "epoch": 1.04, + "learning_rate": 1.010598967043268e-05, + "loss": 0.717, + "step": 814 + }, + { + "epoch": 1.05, + "learning_rate": 1.0063594564324014e-05, + "loss": 0.7213, + "step": 816 + }, + { + "epoch": 1.05, + "learning_rate": 1.0021198315119426e-05, + "loss": 0.6889, + "step": 818 + }, + { + "epoch": 1.05, + "learning_rate": 9.97880168488058e-06, + "loss": 0.7222, + "step": 820 + }, + { + "epoch": 1.05, + "learning_rate": 9.936405435675991e-06, + "loss": 0.7073, + "step": 822 + }, + { + "epoch": 1.06, + "learning_rate": 9.894010329567322e-06, + "loss": 0.6948, + "step": 824 + }, + { + "epoch": 1.06, + "learning_rate": 9.851617128595694e-06, + "loss": 0.6909, + "step": 826 + }, + { + "epoch": 1.06, + "learning_rate": 9.809226594767979e-06, + "loss": 0.7265, + "step": 828 + }, + { + "epoch": 1.06, + "learning_rate": 9.766839490043108e-06, + "loss": 0.7449, + "step": 830 + }, + { + "epoch": 1.07, + "learning_rate": 9.724456576318383e-06, + "loss": 0.6824, + "step": 832 + }, + { + "epoch": 1.07, + "learning_rate": 9.682078615415755e-06, + "loss": 0.7546, + "step": 834 + }, 
+ { + "epoch": 1.07, + "learning_rate": 9.63970636906817e-06, + "loss": 0.7216, + "step": 836 + }, + { + "epoch": 1.07, + "learning_rate": 9.597340598905851e-06, + "loss": 0.7102, + "step": 838 + }, + { + "epoch": 1.08, + "learning_rate": 9.554982066442601e-06, + "loss": 0.7133, + "step": 840 + }, + { + "epoch": 1.08, + "learning_rate": 9.512631533062138e-06, + "loss": 0.7392, + "step": 842 + }, + { + "epoch": 1.08, + "learning_rate": 9.470289760004398e-06, + "loss": 0.7172, + "step": 844 + }, + { + "epoch": 1.08, + "learning_rate": 9.427957508351852e-06, + "loss": 0.7022, + "step": 846 + }, + { + "epoch": 1.09, + "learning_rate": 9.385635539015824e-06, + "loss": 0.708, + "step": 848 + }, + { + "epoch": 1.09, + "learning_rate": 9.343324612722819e-06, + "loss": 0.7169, + "step": 850 + }, + { + "epoch": 1.09, + "learning_rate": 9.301025490000843e-06, + "loss": 0.6813, + "step": 852 + }, + { + "epoch": 1.09, + "learning_rate": 9.25873893116574e-06, + "loss": 0.7186, + "step": 854 + }, + { + "epoch": 1.1, + "learning_rate": 9.216465696307513e-06, + "loss": 0.7025, + "step": 856 + }, + { + "epoch": 1.1, + "learning_rate": 9.174206545276678e-06, + "loss": 0.6878, + "step": 858 + }, + { + "epoch": 1.1, + "learning_rate": 9.131962237670599e-06, + "loss": 0.7283, + "step": 860 + }, + { + "epoch": 1.11, + "learning_rate": 9.089733532819825e-06, + "loss": 0.7224, + "step": 862 + }, + { + "epoch": 1.11, + "learning_rate": 9.047521189774456e-06, + "loss": 0.695, + "step": 864 + }, + { + "epoch": 1.11, + "learning_rate": 9.005325967290489e-06, + "loss": 0.6861, + "step": 866 + }, + { + "epoch": 1.11, + "learning_rate": 8.963148623816191e-06, + "loss": 0.7158, + "step": 868 + }, + { + "epoch": 1.12, + "learning_rate": 8.920989917478446e-06, + "loss": 0.7186, + "step": 870 + }, + { + "epoch": 1.12, + "learning_rate": 8.878850606069152e-06, + "loss": 0.7168, + "step": 872 + }, + { + "epoch": 1.12, + "learning_rate": 8.836731447031581e-06, + "loss": 0.7207, + "step": 874 + }, + { + 
"epoch": 1.12, + "learning_rate": 8.79463319744677e-06, + "loss": 0.7077, + "step": 876 + }, + { + "epoch": 1.13, + "learning_rate": 8.752556614019924e-06, + "loss": 0.6978, + "step": 878 + }, + { + "epoch": 1.13, + "learning_rate": 8.710502453066791e-06, + "loss": 0.6966, + "step": 880 + }, + { + "epoch": 1.13, + "learning_rate": 8.668471470500094e-06, + "loss": 0.6956, + "step": 882 + }, + { + "epoch": 1.13, + "learning_rate": 8.626464421815919e-06, + "loss": 0.7097, + "step": 884 + }, + { + "epoch": 1.14, + "learning_rate": 8.584482062080154e-06, + "loss": 0.7226, + "step": 886 + }, + { + "epoch": 1.14, + "learning_rate": 8.542525145914907e-06, + "loss": 0.715, + "step": 888 + }, + { + "epoch": 1.14, + "learning_rate": 8.500594427484946e-06, + "loss": 0.6873, + "step": 890 + }, + { + "epoch": 1.14, + "learning_rate": 8.458690660484134e-06, + "loss": 0.718, + "step": 892 + }, + { + "epoch": 1.15, + "learning_rate": 8.416814598121901e-06, + "loss": 0.7284, + "step": 894 + }, + { + "epoch": 1.15, + "learning_rate": 8.374966993109689e-06, + "loss": 0.7015, + "step": 896 + }, + { + "epoch": 1.15, + "learning_rate": 8.333148597647414e-06, + "loss": 0.7077, + "step": 898 + }, + { + "epoch": 1.15, + "learning_rate": 8.291360163409978e-06, + "loss": 0.7196, + "step": 900 + }, + { + "epoch": 1.16, + "learning_rate": 8.249602441533727e-06, + "loss": 0.694, + "step": 902 + }, + { + "epoch": 1.16, + "learning_rate": 8.207876182602959e-06, + "loss": 0.6717, + "step": 904 + }, + { + "epoch": 1.16, + "learning_rate": 8.16618213663644e-06, + "loss": 0.6761, + "step": 906 + }, + { + "epoch": 1.16, + "learning_rate": 8.12452105307391e-06, + "loss": 0.6891, + "step": 908 + }, + { + "epoch": 1.17, + "learning_rate": 8.082893680762619e-06, + "loss": 0.7079, + "step": 910 + }, + { + "epoch": 1.17, + "learning_rate": 8.041300767943867e-06, + "loss": 0.7259, + "step": 912 + }, + { + "epoch": 1.17, + "learning_rate": 7.999743062239557e-06, + "loss": 0.6779, + "step": 914 + }, + { + 
"epoch": 1.17, + "learning_rate": 7.958221310638749e-06, + "loss": 0.7244, + "step": 916 + }, + { + "epoch": 1.18, + "learning_rate": 7.916736259484239e-06, + "loss": 0.728, + "step": 918 + }, + { + "epoch": 1.18, + "learning_rate": 7.875288654459144e-06, + "loss": 0.7062, + "step": 920 + }, + { + "epoch": 1.18, + "learning_rate": 7.833879240573487e-06, + "loss": 0.6839, + "step": 922 + }, + { + "epoch": 1.18, + "learning_rate": 7.792508762150833e-06, + "loss": 0.7387, + "step": 924 + }, + { + "epoch": 1.19, + "learning_rate": 7.751177962814867e-06, + "loss": 0.6914, + "step": 926 + }, + { + "epoch": 1.19, + "learning_rate": 7.709887585476075e-06, + "loss": 0.7109, + "step": 928 + }, + { + "epoch": 1.19, + "learning_rate": 7.668638372318359e-06, + "loss": 0.7034, + "step": 930 + }, + { + "epoch": 1.19, + "learning_rate": 7.627431064785705e-06, + "loss": 0.6729, + "step": 932 + }, + { + "epoch": 1.2, + "learning_rate": 7.5862664035688604e-06, + "loss": 0.6782, + "step": 934 + }, + { + "epoch": 1.2, + "learning_rate": 7.545145128592009e-06, + "loss": 0.691, + "step": 936 + }, + { + "epoch": 1.2, + "learning_rate": 7.504067978999484e-06, + "loss": 0.7078, + "step": 938 + }, + { + "epoch": 1.21, + "learning_rate": 7.463035693142473e-06, + "loss": 0.7147, + "step": 940 + }, + { + "epoch": 1.21, + "learning_rate": 7.422049008565757e-06, + "loss": 0.6988, + "step": 942 + }, + { + "epoch": 1.21, + "learning_rate": 7.38110866199443e-06, + "loss": 0.7316, + "step": 944 + }, + { + "epoch": 1.21, + "learning_rate": 7.340215389320686e-06, + "loss": 0.6732, + "step": 946 + }, + { + "epoch": 1.22, + "learning_rate": 7.299369925590575e-06, + "loss": 0.6761, + "step": 948 + }, + { + "epoch": 1.22, + "learning_rate": 7.258573004990789e-06, + "loss": 0.7179, + "step": 950 + }, + { + "epoch": 1.22, + "learning_rate": 7.217825360835475e-06, + "loss": 0.6954, + "step": 952 + }, + { + "epoch": 1.22, + "learning_rate": 7.1771277255530456e-06, + "loss": 0.6958, + "step": 954 + }, + { + 
"epoch": 1.23, + "learning_rate": 7.136480830673018e-06, + "loss": 0.6819, + "step": 956 + }, + { + "epoch": 1.23, + "learning_rate": 7.095885406812866e-06, + "loss": 0.6857, + "step": 958 + }, + { + "epoch": 1.23, + "learning_rate": 7.05534218366488e-06, + "loss": 0.6807, + "step": 960 + }, + { + "epoch": 1.23, + "learning_rate": 7.014851889983058e-06, + "loss": 0.6931, + "step": 962 + }, + { + "epoch": 1.24, + "learning_rate": 6.974415253570003e-06, + "loss": 0.6919, + "step": 964 + }, + { + "epoch": 1.24, + "learning_rate": 6.934033001263847e-06, + "loss": 0.6966, + "step": 966 + }, + { + "epoch": 1.24, + "learning_rate": 6.893705858925179e-06, + "loss": 0.6898, + "step": 968 + }, + { + "epoch": 1.24, + "learning_rate": 6.853434551424001e-06, + "loss": 0.6783, + "step": 970 + }, + { + "epoch": 1.25, + "learning_rate": 6.813219802626698e-06, + "loss": 0.6992, + "step": 972 + }, + { + "epoch": 1.25, + "learning_rate": 6.773062335383024e-06, + "loss": 0.7435, + "step": 974 + }, + { + "epoch": 1.25, + "learning_rate": 6.73296287151312e-06, + "loss": 0.6891, + "step": 976 + }, + { + "epoch": 1.25, + "learning_rate": 6.692922131794517e-06, + "loss": 0.6635, + "step": 978 + }, + { + "epoch": 1.26, + "learning_rate": 6.652940835949208e-06, + "loss": 0.6625, + "step": 980 + }, + { + "epoch": 1.26, + "learning_rate": 6.6130197026306945e-06, + "loss": 0.6909, + "step": 982 + }, + { + "epoch": 1.26, + "learning_rate": 6.573159449411071e-06, + "loss": 0.6937, + "step": 984 + }, + { + "epoch": 1.26, + "learning_rate": 6.533360792768122e-06, + "loss": 0.7026, + "step": 986 + }, + { + "epoch": 1.27, + "learning_rate": 6.4936244480724575e-06, + "loss": 0.6587, + "step": 988 + }, + { + "epoch": 1.27, + "learning_rate": 6.453951129574644e-06, + "loss": 0.6776, + "step": 990 + }, + { + "epoch": 1.27, + "learning_rate": 6.4143415503923676e-06, + "loss": 0.693, + "step": 992 + }, + { + "epoch": 1.27, + "learning_rate": 6.374796422497622e-06, + "loss": 0.7083, + "step": 994 + }, + { + 
"epoch": 1.28, + "learning_rate": 6.335316456703891e-06, + "loss": 0.6781, + "step": 996 + }, + { + "epoch": 1.28, + "learning_rate": 6.295902362653401e-06, + "loss": 0.6893, + "step": 998 + }, + { + "epoch": 1.28, + "learning_rate": 6.256554848804343e-06, + "loss": 0.6693, + "step": 1000 + }, + { + "epoch": 1.28, + "learning_rate": 6.2172746224181524e-06, + "loss": 0.7192, + "step": 1002 + }, + { + "epoch": 1.29, + "learning_rate": 6.178062389546784e-06, + "loss": 0.6856, + "step": 1004 + }, + { + "epoch": 1.29, + "learning_rate": 6.138918855020028e-06, + "loss": 0.7078, + "step": 1006 + }, + { + "epoch": 1.29, + "learning_rate": 6.099844722432844e-06, + "loss": 0.6873, + "step": 1008 + }, + { + "epoch": 1.29, + "learning_rate": 6.060840694132701e-06, + "loss": 0.6734, + "step": 1010 + }, + { + "epoch": 1.3, + "learning_rate": 6.021907471206971e-06, + "loss": 0.6936, + "step": 1012 + }, + { + "epoch": 1.3, + "learning_rate": 5.983045753470308e-06, + "loss": 0.6832, + "step": 1014 + }, + { + "epoch": 1.3, + "learning_rate": 5.944256239452085e-06, + "loss": 0.6686, + "step": 1016 + }, + { + "epoch": 1.31, + "learning_rate": 5.905539626383831e-06, + "loss": 0.7301, + "step": 1018 + }, + { + "epoch": 1.31, + "learning_rate": 5.866896610186701e-06, + "loss": 0.6733, + "step": 1020 + }, + { + "epoch": 1.31, + "learning_rate": 5.82832788545896e-06, + "loss": 0.6825, + "step": 1022 + }, + { + "epoch": 1.31, + "learning_rate": 5.789834145463506e-06, + "loss": 0.7164, + "step": 1024 + }, + { + "epoch": 1.32, + "learning_rate": 5.7514160821154085e-06, + "loss": 0.7157, + "step": 1026 + }, + { + "epoch": 1.32, + "learning_rate": 5.713074385969457e-06, + "loss": 0.6715, + "step": 1028 + }, + { + "epoch": 1.32, + "learning_rate": 5.67480974620778e-06, + "loss": 0.6993, + "step": 1030 + }, + { + "epoch": 1.32, + "learning_rate": 5.63662285062742e-06, + "loss": 0.6721, + "step": 1032 + }, + { + "epoch": 1.33, + "learning_rate": 5.598514385627997e-06, + "loss": 0.6976, + "step": 
1034 + }, + { + "epoch": 1.33, + "learning_rate": 5.56048503619935e-06, + "loss": 0.7012, + "step": 1036 + }, + { + "epoch": 1.33, + "learning_rate": 5.522535485909258e-06, + "loss": 0.6768, + "step": 1038 + }, + { + "epoch": 1.33, + "learning_rate": 5.484666416891109e-06, + "loss": 0.6737, + "step": 1040 + }, + { + "epoch": 1.34, + "learning_rate": 5.446878509831668e-06, + "loss": 0.6513, + "step": 1042 + }, + { + "epoch": 1.34, + "learning_rate": 5.409172443958844e-06, + "loss": 0.6873, + "step": 1044 + }, + { + "epoch": 1.34, + "learning_rate": 5.371548897029457e-06, + "loss": 0.6846, + "step": 1046 + }, + { + "epoch": 1.34, + "learning_rate": 5.334008545317082e-06, + "loss": 0.6688, + "step": 1048 + }, + { + "epoch": 1.35, + "learning_rate": 5.2965520635998676e-06, + "loss": 0.6702, + "step": 1050 + }, + { + "epoch": 1.35, + "learning_rate": 5.259180125148442e-06, + "loss": 0.6873, + "step": 1052 + }, + { + "epoch": 1.35, + "learning_rate": 5.22189340171377e-06, + "loss": 0.7016, + "step": 1054 + }, + { + "epoch": 1.35, + "learning_rate": 5.184692563515104e-06, + "loss": 0.7156, + "step": 1056 + }, + { + "epoch": 1.36, + "learning_rate": 5.147578279227943e-06, + "loss": 0.6664, + "step": 1058 + }, + { + "epoch": 1.36, + "learning_rate": 5.110551215971981e-06, + "loss": 0.6799, + "step": 1060 + }, + { + "epoch": 1.36, + "learning_rate": 5.073612039299157e-06, + "loss": 0.6935, + "step": 1062 + }, + { + "epoch": 1.36, + "learning_rate": 5.036761413181659e-06, + "loss": 0.721, + "step": 1064 + }, + { + "epoch": 1.37, + "learning_rate": 5.000000000000003e-06, + "loss": 0.6742, + "step": 1066 + }, + { + "epoch": 1.37, + "learning_rate": 4.963328460531127e-06, + "loss": 0.649, + "step": 1068 + }, + { + "epoch": 1.37, + "learning_rate": 4.926747453936509e-06, + "loss": 0.707, + "step": 1070 + }, + { + "epoch": 1.37, + "learning_rate": 4.890257637750332e-06, + "loss": 0.7019, + "step": 1072 + }, + { + "epoch": 1.38, + "learning_rate": 4.853859667867641e-06, + "loss": 
0.6573, + "step": 1074 + }, + { + "epoch": 1.38, + "learning_rate": 4.817554198532582e-06, + "loss": 0.6775, + "step": 1076 + }, + { + "epoch": 1.38, + "learning_rate": 4.781341882326615e-06, + "loss": 0.6691, + "step": 1078 + }, + { + "epoch": 1.38, + "learning_rate": 4.745223370156797e-06, + "loss": 0.7141, + "step": 1080 + }, + { + "epoch": 1.39, + "learning_rate": 4.709199311244098e-06, + "loss": 0.7, + "step": 1082 + }, + { + "epoch": 1.39, + "learning_rate": 4.673270353111687e-06, + "loss": 0.729, + "step": 1084 + }, + { + "epoch": 1.39, + "learning_rate": 4.63743714157335e-06, + "loss": 0.6957, + "step": 1086 + }, + { + "epoch": 1.39, + "learning_rate": 4.6017003207218294e-06, + "loss": 0.6602, + "step": 1088 + }, + { + "epoch": 1.4, + "learning_rate": 4.566060532917288e-06, + "loss": 0.6949, + "step": 1090 + }, + { + "epoch": 1.4, + "learning_rate": 4.530518418775734e-06, + "loss": 0.6756, + "step": 1092 + }, + { + "epoch": 1.4, + "learning_rate": 4.4950746171575135e-06, + "loss": 0.6676, + "step": 1094 + }, + { + "epoch": 1.41, + "learning_rate": 4.459729765155842e-06, + "loss": 0.6849, + "step": 1096 + }, + { + "epoch": 1.41, + "learning_rate": 4.424484498085335e-06, + "loss": 0.6607, + "step": 1098 + }, + { + "epoch": 1.41, + "learning_rate": 4.389339449470592e-06, + "loss": 0.7236, + "step": 1100 + }, + { + "epoch": 1.41, + "learning_rate": 4.354295251034811e-06, + "loss": 0.6699, + "step": 1102 + }, + { + "epoch": 1.42, + "learning_rate": 4.319352532688444e-06, + "loss": 0.6536, + "step": 1104 + }, + { + "epoch": 1.42, + "learning_rate": 4.284511922517853e-06, + "loss": 0.6484, + "step": 1106 + }, + { + "epoch": 1.42, + "learning_rate": 4.249774046774034e-06, + "loss": 0.7371, + "step": 1108 + }, + { + "epoch": 1.42, + "learning_rate": 4.2151395298613675e-06, + "loss": 0.6922, + "step": 1110 + }, + { + "epoch": 1.43, + "learning_rate": 4.180608994326371e-06, + "loss": 0.7273, + "step": 1112 + }, + { + "epoch": 1.43, + "learning_rate": 
4.1461830608465385e-06, + "loss": 0.7084, + "step": 1114 + }, + { + "epoch": 1.43, + "learning_rate": 4.111862348219158e-06, + "loss": 0.7049, + "step": 1116 + }, + { + "epoch": 1.43, + "learning_rate": 4.077647473350201e-06, + "loss": 0.7021, + "step": 1118 + }, + { + "epoch": 1.44, + "learning_rate": 4.04353905124324e-06, + "loss": 0.6795, + "step": 1120 + }, + { + "epoch": 1.44, + "learning_rate": 4.009537694988372e-06, + "loss": 0.6702, + "step": 1122 + }, + { + "epoch": 1.44, + "learning_rate": 3.975644015751234e-06, + "loss": 0.6583, + "step": 1124 + }, + { + "epoch": 1.44, + "learning_rate": 3.941858622761975e-06, + "loss": 0.6646, + "step": 1126 + }, + { + "epoch": 1.45, + "learning_rate": 3.908182123304344e-06, + "loss": 0.715, + "step": 1128 + }, + { + "epoch": 1.45, + "learning_rate": 3.8746151227047455e-06, + "loss": 0.6578, + "step": 1130 + }, + { + "epoch": 1.45, + "learning_rate": 3.84115822432137e-06, + "loss": 0.7, + "step": 1132 + }, + { + "epoch": 1.45, + "learning_rate": 3.807812029533362e-06, + "loss": 0.7206, + "step": 1134 + }, + { + "epoch": 1.46, + "learning_rate": 3.7745771377299758e-06, + "loss": 0.6845, + "step": 1136 + }, + { + "epoch": 1.46, + "learning_rate": 3.7414541462998446e-06, + "loss": 0.683, + "step": 1138 + }, + { + "epoch": 1.46, + "learning_rate": 3.708443650620206e-06, + "loss": 0.7184, + "step": 1140 + }, + { + "epoch": 1.46, + "learning_rate": 3.6755462440462288e-06, + "loss": 0.6447, + "step": 1142 + }, + { + "epoch": 1.47, + "learning_rate": 3.6427625179003223e-06, + "loss": 0.6881, + "step": 1144 + }, + { + "epoch": 1.47, + "learning_rate": 3.6100930614615204e-06, + "loss": 0.7014, + "step": 1146 + }, + { + "epoch": 1.47, + "learning_rate": 3.5775384619549e-06, + "loss": 0.6709, + "step": 1148 + }, + { + "epoch": 1.47, + "learning_rate": 3.5450993045409997e-06, + "loss": 0.7159, + "step": 1150 + }, + { + "epoch": 1.48, + "learning_rate": 3.5127761723053313e-06, + "loss": 0.6575, + "step": 1152 + }, + { + "epoch": 
1.48, + "learning_rate": 3.4805696462478634e-06, + "loss": 0.6786, + "step": 1154 + }, + { + "epoch": 1.48, + "learning_rate": 3.448480305272619e-06, + "loss": 0.659, + "step": 1156 + }, + { + "epoch": 1.48, + "learning_rate": 3.41650872617724e-06, + "loss": 0.6946, + "step": 1158 + }, + { + "epoch": 1.49, + "learning_rate": 3.384655483642624e-06, + "loss": 0.7109, + "step": 1160 + }, + { + "epoch": 1.49, + "learning_rate": 3.352921150222612e-06, + "loss": 0.7044, + "step": 1162 + }, + { + "epoch": 1.49, + "learning_rate": 3.321306296333673e-06, + "loss": 0.6759, + "step": 1164 + }, + { + "epoch": 1.49, + "learning_rate": 3.2898114902446708e-06, + "loss": 0.6957, + "step": 1166 + }, + { + "epoch": 1.5, + "learning_rate": 3.2584372980666344e-06, + "loss": 0.691, + "step": 1168 + }, + { + "epoch": 1.5, + "learning_rate": 3.2271842837425917e-06, + "loss": 0.7044, + "step": 1170 + }, + { + "epoch": 1.5, + "learning_rate": 3.1960530090374277e-06, + "loss": 0.6487, + "step": 1172 + }, + { + "epoch": 1.51, + "learning_rate": 3.165044033527789e-06, + "loss": 0.6212, + "step": 1174 + }, + { + "epoch": 1.51, + "learning_rate": 3.134157914592032e-06, + "loss": 0.659, + "step": 1176 + }, + { + "epoch": 1.51, + "learning_rate": 3.1033952074001882e-06, + "loss": 0.7016, + "step": 1178 + }, + { + "epoch": 1.51, + "learning_rate": 3.0727564649040066e-06, + "loss": 0.6829, + "step": 1180 + }, + { + "epoch": 1.52, + "learning_rate": 3.042242237826991e-06, + "loss": 0.6488, + "step": 1182 + }, + { + "epoch": 1.52, + "learning_rate": 3.011853074654515e-06, + "loss": 0.7057, + "step": 1184 + }, + { + "epoch": 1.52, + "learning_rate": 2.981589521623973e-06, + "loss": 0.6641, + "step": 1186 + }, + { + "epoch": 1.52, + "learning_rate": 2.951452122714926e-06, + "loss": 0.678, + "step": 1188 + }, + { + "epoch": 1.53, + "learning_rate": 2.9214414196393702e-06, + "loss": 0.6965, + "step": 1190 + }, + { + "epoch": 1.53, + "learning_rate": 2.8915579518319626e-06, + "loss": 0.6813, + "step": 
1192 + }, + { + "epoch": 1.53, + "learning_rate": 2.861802256440348e-06, + "loss": 0.6936, + "step": 1194 + }, + { + "epoch": 1.53, + "learning_rate": 2.8321748683154893e-06, + "loss": 0.6601, + "step": 1196 + }, + { + "epoch": 1.54, + "learning_rate": 2.8026763200020557e-06, + "loss": 0.6615, + "step": 1198 + }, + { + "epoch": 1.54, + "learning_rate": 2.773307141728867e-06, + "loss": 0.7019, + "step": 1200 + }, + { + "epoch": 1.54, + "learning_rate": 2.744067861399333e-06, + "loss": 0.6577, + "step": 1202 + }, + { + "epoch": 1.54, + "learning_rate": 2.714959004582003e-06, + "loss": 0.6926, + "step": 1204 + }, + { + "epoch": 1.55, + "learning_rate": 2.6859810945010687e-06, + "loss": 0.6947, + "step": 1206 + }, + { + "epoch": 1.55, + "learning_rate": 2.6571346520270147e-06, + "loss": 0.6739, + "step": 1208 + }, + { + "epoch": 1.55, + "learning_rate": 2.628420195667214e-06, + "loss": 0.6861, + "step": 1210 + }, + { + "epoch": 1.55, + "learning_rate": 2.5998382415566258e-06, + "loss": 0.6433, + "step": 1212 + }, + { + "epoch": 1.56, + "learning_rate": 2.5713893034485216e-06, + "loss": 0.6695, + "step": 1214 + }, + { + "epoch": 1.56, + "learning_rate": 2.5430738927052346e-06, + "loss": 0.7075, + "step": 1216 + }, + { + "epoch": 1.56, + "learning_rate": 2.514892518288988e-06, + "loss": 0.7039, + "step": 1218 + }, + { + "epoch": 1.56, + "learning_rate": 2.4868456867527315e-06, + "loss": 0.6662, + "step": 1220 + }, + { + "epoch": 1.57, + "learning_rate": 2.4589339022310386e-06, + "loss": 0.6684, + "step": 1222 + }, + { + "epoch": 1.57, + "learning_rate": 2.431157666431052e-06, + "loss": 0.7121, + "step": 1224 + }, + { + "epoch": 1.57, + "learning_rate": 2.403517478623456e-06, + "loss": 0.6479, + "step": 1226 + }, + { + "epoch": 1.57, + "learning_rate": 2.3760138356335172e-06, + "loss": 0.6908, + "step": 1228 + }, + { + "epoch": 1.58, + "learning_rate": 2.348647231832131e-06, + "loss": 0.6859, + "step": 1230 + }, + { + "epoch": 1.58, + "learning_rate": 
2.3214181591269603e-06, + "loss": 0.7063, + "step": 1232 + }, + { + "epoch": 1.58, + "learning_rate": 2.2943271069535754e-06, + "loss": 0.6944, + "step": 1234 + }, + { + "epoch": 1.58, + "learning_rate": 2.267374562266662e-06, + "loss": 0.6558, + "step": 1236 + }, + { + "epoch": 1.59, + "learning_rate": 2.240561009531281e-06, + "loss": 0.6612, + "step": 1238 + }, + { + "epoch": 1.59, + "learning_rate": 2.2138869307141266e-06, + "loss": 0.6312, + "step": 1240 + }, + { + "epoch": 1.59, + "learning_rate": 2.1873528052749094e-06, + "loss": 0.6734, + "step": 1242 + }, + { + "epoch": 1.59, + "learning_rate": 2.1609591101576945e-06, + "loss": 0.6691, + "step": 1244 + }, + { + "epoch": 1.6, + "learning_rate": 2.1347063197823648e-06, + "loss": 0.6724, + "step": 1246 + }, + { + "epoch": 1.6, + "learning_rate": 2.1085949060360654e-06, + "loss": 0.6983, + "step": 1248 + }, + { + "epoch": 1.6, + "learning_rate": 2.0826253382647334e-06, + "loss": 0.7246, + "step": 1250 + }, + { + "epoch": 1.61, + "learning_rate": 2.056798083264667e-06, + "loss": 0.6467, + "step": 1252 + }, + { + "epoch": 1.61, + "learning_rate": 2.0311136052741274e-06, + "loss": 0.6814, + "step": 1254 + }, + { + "epoch": 1.61, + "learning_rate": 2.0055723659649907e-06, + "loss": 0.6907, + "step": 1256 + }, + { + "epoch": 1.61, + "learning_rate": 1.9801748244344587e-06, + "loss": 0.6459, + "step": 1258 + }, + { + "epoch": 1.62, + "learning_rate": 1.9549214371968008e-06, + "loss": 0.6704, + "step": 1260 + }, + { + "epoch": 1.62, + "learning_rate": 1.9298126581751542e-06, + "loss": 0.71, + "step": 1262 + }, + { + "epoch": 1.62, + "learning_rate": 1.9048489386933545e-06, + "loss": 0.6703, + "step": 1264 + }, + { + "epoch": 1.62, + "learning_rate": 1.8800307274678364e-06, + "loss": 0.7293, + "step": 1266 + }, + { + "epoch": 1.63, + "learning_rate": 1.8553584705995564e-06, + "loss": 0.6838, + "step": 1268 + }, + { + "epoch": 1.63, + "learning_rate": 1.8308326115659757e-06, + "loss": 0.6407, + "step": 1270 + }, + { + 
"epoch": 1.63, + "learning_rate": 1.8064535912131032e-06, + "loss": 0.6961, + "step": 1272 + }, + { + "epoch": 1.63, + "learning_rate": 1.7822218477475496e-06, + "loss": 0.6786, + "step": 1274 + }, + { + "epoch": 1.64, + "learning_rate": 1.7581378167286655e-06, + "loss": 0.6886, + "step": 1276 + }, + { + "epoch": 1.64, + "learning_rate": 1.7342019310607062e-06, + "loss": 0.6726, + "step": 1278 + }, + { + "epoch": 1.64, + "learning_rate": 1.7104146209850591e-06, + "loss": 0.6401, + "step": 1280 + }, + { + "epoch": 1.64, + "learning_rate": 1.6867763140724969e-06, + "loss": 0.7228, + "step": 1282 + }, + { + "epoch": 1.65, + "learning_rate": 1.6632874352154982e-06, + "loss": 0.6508, + "step": 1284 + }, + { + "epoch": 1.65, + "learning_rate": 1.6399484066206183e-06, + "loss": 0.6471, + "step": 1286 + }, + { + "epoch": 1.65, + "learning_rate": 1.6167596478008817e-06, + "loss": 0.6791, + "step": 1288 + }, + { + "epoch": 1.65, + "learning_rate": 1.5937215755682667e-06, + "loss": 0.6975, + "step": 1290 + }, + { + "epoch": 1.66, + "learning_rate": 1.5708346040261812e-06, + "loss": 0.6959, + "step": 1292 + }, + { + "epoch": 1.66, + "learning_rate": 1.5480991445620541e-06, + "loss": 0.7002, + "step": 1294 + }, + { + "epoch": 1.66, + "learning_rate": 1.5255156058399124e-06, + "loss": 0.6931, + "step": 1296 + }, + { + "epoch": 1.66, + "learning_rate": 1.5030843937930485e-06, + "loss": 0.6847, + "step": 1298 + }, + { + "epoch": 1.67, + "learning_rate": 1.4808059116167306e-06, + "loss": 0.6789, + "step": 1300 + }, + { + "epoch": 1.67, + "learning_rate": 1.4586805597609333e-06, + "loss": 0.6396, + "step": 1302 + }, + { + "epoch": 1.67, + "learning_rate": 1.4367087359231668e-06, + "loss": 0.6904, + "step": 1304 + }, + { + "epoch": 1.67, + "learning_rate": 1.4148908350413048e-06, + "loss": 0.685, + "step": 1306 + }, + { + "epoch": 1.68, + "learning_rate": 1.3932272492864984e-06, + "loss": 0.6563, + "step": 1308 + }, + { + "epoch": 1.68, + "learning_rate": 1.3717183680561253e-06, + 
"loss": 0.6695, + "step": 1310 + }, + { + "epoch": 1.68, + "learning_rate": 1.3503645779667852e-06, + "loss": 0.6376, + "step": 1312 + }, + { + "epoch": 1.68, + "learning_rate": 1.3291662628473634e-06, + "loss": 0.6618, + "step": 1314 + }, + { + "epoch": 1.69, + "learning_rate": 1.308123803732111e-06, + "loss": 0.6975, + "step": 1316 + }, + { + "epoch": 1.69, + "learning_rate": 1.2872375788538171e-06, + "loss": 0.7143, + "step": 1318 + }, + { + "epoch": 1.69, + "learning_rate": 1.266507963636997e-06, + "loss": 0.6923, + "step": 1320 + }, + { + "epoch": 1.69, + "learning_rate": 1.2459353306911438e-06, + "loss": 0.6916, + "step": 1322 + }, + { + "epoch": 1.7, + "learning_rate": 1.2255200498040432e-06, + "loss": 0.6814, + "step": 1324 + }, + { + "epoch": 1.7, + "learning_rate": 1.2052624879351105e-06, + "loss": 0.6645, + "step": 1326 + }, + { + "epoch": 1.7, + "learning_rate": 1.1851630092088051e-06, + "loss": 0.6505, + "step": 1328 + }, + { + "epoch": 1.71, + "learning_rate": 1.1652219749080817e-06, + "loss": 0.6674, + "step": 1330 + }, + { + "epoch": 1.71, + "learning_rate": 1.1454397434679022e-06, + "loss": 0.6732, + "step": 1332 + }, + { + "epoch": 1.71, + "learning_rate": 1.12581667046878e-06, + "loss": 0.697, + "step": 1334 + }, + { + "epoch": 1.71, + "learning_rate": 1.1063531086304003e-06, + "loss": 0.6439, + "step": 1336 + }, + { + "epoch": 1.72, + "learning_rate": 1.0870494078052796e-06, + "loss": 0.6709, + "step": 1338 + }, + { + "epoch": 1.72, + "learning_rate": 1.067905914972468e-06, + "loss": 0.6585, + "step": 1340 + }, + { + "epoch": 1.72, + "learning_rate": 1.0489229742313223e-06, + "loss": 0.665, + "step": 1342 + }, + { + "epoch": 1.72, + "learning_rate": 1.0301009267953145e-06, + "loss": 0.6689, + "step": 1344 + }, + { + "epoch": 1.73, + "learning_rate": 1.0114401109859019e-06, + "loss": 0.6836, + "step": 1346 + }, + { + "epoch": 1.73, + "learning_rate": 9.929408622264448e-07, + "loss": 0.6462, + "step": 1348 + }, + { + "epoch": 1.73, + 
"learning_rate": 9.746035130361741e-07, + "loss": 0.6645, + "step": 1350 + }, + { + "epoch": 1.73, + "learning_rate": 9.564283930242258e-07, + "loss": 0.7051, + "step": 1352 + }, + { + "epoch": 1.74, + "learning_rate": 9.384158288836987e-07, + "loss": 0.6666, + "step": 1354 + }, + { + "epoch": 1.74, + "learning_rate": 9.205661443857994e-07, + "loss": 0.6401, + "step": 1356 + }, + { + "epoch": 1.74, + "learning_rate": 9.028796603740097e-07, + "loss": 0.705, + "step": 1358 + }, + { + "epoch": 1.74, + "learning_rate": 8.853566947583259e-07, + "loss": 0.6418, + "step": 1360 + }, + { + "epoch": 1.75, + "learning_rate": 8.67997562509546e-07, + "loss": 0.6825, + "step": 1362 + }, + { + "epoch": 1.75, + "learning_rate": 8.508025756535987e-07, + "loss": 0.6618, + "step": 1364 + }, + { + "epoch": 1.75, + "learning_rate": 8.337720432659513e-07, + "loss": 0.6897, + "step": 1366 + }, + { + "epoch": 1.75, + "learning_rate": 8.169062714660347e-07, + "loss": 0.6325, + "step": 1368 + }, + { + "epoch": 1.76, + "learning_rate": 8.002055634117578e-07, + "loss": 0.6433, + "step": 1370 + }, + { + "epoch": 1.76, + "learning_rate": 7.836702192940493e-07, + "loss": 0.6817, + "step": 1372 + }, + { + "epoch": 1.76, + "learning_rate": 7.673005363314578e-07, + "loss": 0.6696, + "step": 1374 + }, + { + "epoch": 1.76, + "learning_rate": 7.510968087648262e-07, + "loss": 0.6868, + "step": 1376 + }, + { + "epoch": 1.77, + "learning_rate": 7.350593278519824e-07, + "loss": 0.676, + "step": 1378 + }, + { + "epoch": 1.77, + "learning_rate": 7.19188381862519e-07, + "loss": 0.657, + "step": 1380 + }, + { + "epoch": 1.77, + "learning_rate": 7.034842560726008e-07, + "loss": 0.6616, + "step": 1382 + }, + { + "epoch": 1.77, + "learning_rate": 6.879472327598502e-07, + "loss": 0.664, + "step": 1384 + }, + { + "epoch": 1.78, + "learning_rate": 6.725775911982602e-07, + "loss": 0.6902, + "step": 1386 + }, + { + "epoch": 1.78, + "learning_rate": 6.573756076531779e-07, + "loss": 0.6478, + "step": 1388 + }, + { + 
"epoch": 1.78, + "learning_rate": 6.423415553763479e-07, + "loss": 0.6739, + "step": 1390 + }, + { + "epoch": 1.78, + "learning_rate": 6.274757046009871e-07, + "loss": 0.6771, + "step": 1392 + }, + { + "epoch": 1.79, + "learning_rate": 6.127783225369377e-07, + "loss": 0.7025, + "step": 1394 + }, + { + "epoch": 1.79, + "learning_rate": 5.982496733658582e-07, + "loss": 0.6547, + "step": 1396 + }, + { + "epoch": 1.79, + "learning_rate": 5.83890018236476e-07, + "loss": 0.6664, + "step": 1398 + }, + { + "epoch": 1.79, + "learning_rate": 5.696996152598966e-07, + "loss": 0.6783, + "step": 1400 + }, + { + "epoch": 1.8, + "learning_rate": 5.556787195049573e-07, + "loss": 0.6564, + "step": 1402 + }, + { + "epoch": 1.8, + "learning_rate": 5.418275829936537e-07, + "loss": 0.6886, + "step": 1404 + }, + { + "epoch": 1.8, + "learning_rate": 5.281464546965953e-07, + "loss": 0.6783, + "step": 1406 + }, + { + "epoch": 1.81, + "learning_rate": 5.146355805285452e-07, + "loss": 0.6822, + "step": 1408 + }, + { + "epoch": 1.81, + "learning_rate": 5.012952033439844e-07, + "loss": 0.6595, + "step": 1410 + }, + { + "epoch": 1.81, + "learning_rate": 4.881255629327608e-07, + "loss": 0.6639, + "step": 1412 + }, + { + "epoch": 1.81, + "learning_rate": 4.7512689601576843e-07, + "loss": 0.6598, + "step": 1414 + }, + { + "epoch": 1.82, + "learning_rate": 4.6229943624069963e-07, + "loss": 0.6847, + "step": 1416 + }, + { + "epoch": 1.82, + "learning_rate": 4.4964341417784165e-07, + "loss": 0.6823, + "step": 1418 + }, + { + "epoch": 1.82, + "learning_rate": 4.3715905731593233e-07, + "loss": 0.6886, + "step": 1420 + }, + { + "epoch": 1.82, + "learning_rate": 4.248465900580734e-07, + "loss": 0.6734, + "step": 1422 + }, + { + "epoch": 1.83, + "learning_rate": 4.127062337176935e-07, + "loss": 0.6464, + "step": 1424 + }, + { + "epoch": 1.83, + "learning_rate": 4.0073820651457043e-07, + "loss": 0.6648, + "step": 1426 + }, + { + "epoch": 1.83, + "learning_rate": 3.889427235709153e-07, + "loss": 0.625, + 
"step": 1428 + }, + { + "epoch": 1.83, + "learning_rate": 3.773199969074959e-07, + "loss": 0.6812, + "step": 1430 + }, + { + "epoch": 1.84, + "learning_rate": 3.658702354398325e-07, + "loss": 0.662, + "step": 1432 + }, + { + "epoch": 1.84, + "learning_rate": 3.5459364497443696e-07, + "loss": 0.6536, + "step": 1434 + }, + { + "epoch": 1.84, + "learning_rate": 3.4349042820512325e-07, + "loss": 0.688, + "step": 1436 + }, + { + "epoch": 1.84, + "learning_rate": 3.325607847093537e-07, + "loss": 0.6674, + "step": 1438 + }, + { + "epoch": 1.85, + "learning_rate": 3.2180491094465414e-07, + "loss": 0.6821, + "step": 1440 + }, + { + "epoch": 1.85, + "learning_rate": 3.112230002450889e-07, + "loss": 0.6742, + "step": 1442 + }, + { + "epoch": 1.85, + "learning_rate": 3.0081524281777687e-07, + "loss": 0.7051, + "step": 1444 + }, + { + "epoch": 1.85, + "learning_rate": 2.905818257394799e-07, + "loss": 0.6503, + "step": 1446 + }, + { + "epoch": 1.86, + "learning_rate": 2.805229329532344e-07, + "loss": 0.6861, + "step": 1448 + }, + { + "epoch": 1.86, + "learning_rate": 2.706387452650494e-07, + "loss": 0.6831, + "step": 1450 + }, + { + "epoch": 1.86, + "learning_rate": 2.609294403406537e-07, + "loss": 0.6693, + "step": 1452 + }, + { + "epoch": 1.86, + "learning_rate": 2.513951927023017e-07, + "loss": 0.6833, + "step": 1454 + }, + { + "epoch": 1.87, + "learning_rate": 2.420361737256438e-07, + "loss": 0.6734, + "step": 1456 + }, + { + "epoch": 1.87, + "learning_rate": 2.3285255163663535e-07, + "loss": 0.6683, + "step": 1458 + }, + { + "epoch": 1.87, + "learning_rate": 2.2384449150851695e-07, + "loss": 0.6825, + "step": 1460 + }, + { + "epoch": 1.87, + "learning_rate": 2.1501215525885245e-07, + "loss": 0.616, + "step": 1462 + }, + { + "epoch": 1.88, + "learning_rate": 2.063557016466111e-07, + "loss": 0.6554, + "step": 1464 + }, + { + "epoch": 1.88, + "learning_rate": 1.978752862693212e-07, + "loss": 0.6607, + "step": 1466 + }, + { + "epoch": 1.88, + "learning_rate": 
1.8957106156026084e-07, + "loss": 0.6362, + "step": 1468 + }, + { + "epoch": 1.88, + "learning_rate": 1.8144317678573497e-07, + "loss": 0.6186, + "step": 1470 + }, + { + "epoch": 1.89, + "learning_rate": 1.7349177804237837e-07, + "loss": 0.6684, + "step": 1472 + }, + { + "epoch": 1.89, + "learning_rate": 1.6571700825453674e-07, + "loss": 0.6458, + "step": 1474 + }, + { + "epoch": 1.89, + "learning_rate": 1.5811900717169537e-07, + "loss": 0.6379, + "step": 1476 + }, + { + "epoch": 1.89, + "learning_rate": 1.506979113659679e-07, + "loss": 0.6912, + "step": 1478 + }, + { + "epoch": 1.9, + "learning_rate": 1.4345385422964043e-07, + "loss": 0.656, + "step": 1480 + }, + { + "epoch": 1.9, + "learning_rate": 1.3638696597277678e-07, + "loss": 0.6447, + "step": 1482 + }, + { + "epoch": 1.9, + "learning_rate": 1.2949737362087156e-07, + "loss": 0.6355, + "step": 1484 + }, + { + "epoch": 1.91, + "learning_rate": 1.227852010125752e-07, + "loss": 0.6725, + "step": 1486 + }, + { + "epoch": 1.91, + "learning_rate": 1.1625056879746133e-07, + "loss": 0.6324, + "step": 1488 + }, + { + "epoch": 1.91, + "learning_rate": 1.0989359443386305e-07, + "loss": 0.6771, + "step": 1490 + }, + { + "epoch": 1.91, + "learning_rate": 1.0371439218675671e-07, + "loss": 0.6929, + "step": 1492 + }, + { + "epoch": 1.92, + "learning_rate": 9.771307312571254e-08, + "loss": 0.6808, + "step": 1494 + }, + { + "epoch": 1.92, + "learning_rate": 9.188974512289617e-08, + "loss": 0.6505, + "step": 1496 + }, + { + "epoch": 1.92, + "learning_rate": 8.624451285112689e-08, + "loss": 0.6443, + "step": 1498 + }, + { + "epoch": 1.92, + "learning_rate": 8.077747778200474e-08, + "loss": 0.7029, + "step": 1500 + }, + { + "epoch": 1.93, + "learning_rate": 7.54887381840752e-08, + "loss": 0.6621, + "step": 1502 + }, + { + "epoch": 1.93, + "learning_rate": 7.037838912107298e-08, + "loss": 0.6867, + "step": 1504 + }, + { + "epoch": 1.93, + "learning_rate": 6.544652245020433e-08, + "loss": 0.6881, + "step": 1506 + }, + { + 
"epoch": 1.93, + "learning_rate": 6.069322682050516e-08, + "loss": 0.6912, + "step": 1508 + }, + { + "epoch": 1.94, + "learning_rate": 5.611858767124001e-08, + "loss": 0.6987, + "step": 1510 + }, + { + "epoch": 1.94, + "learning_rate": 5.1722687230369995e-08, + "loss": 0.6567, + "step": 1512 + }, + { + "epoch": 1.94, + "learning_rate": 4.7505604513072845e-08, + "loss": 0.655, + "step": 1514 + }, + { + "epoch": 1.94, + "learning_rate": 4.346741532032628e-08, + "loss": 0.661, + "step": 1516 + }, + { + "epoch": 1.95, + "learning_rate": 3.96081922375402e-08, + "loss": 0.6614, + "step": 1518 + }, + { + "epoch": 1.95, + "learning_rate": 3.592800463325663e-08, + "loss": 0.6765, + "step": 1520 + }, + { + "epoch": 1.95, + "learning_rate": 3.242691865790071e-08, + "loss": 0.6299, + "step": 1522 + }, + { + "epoch": 1.95, + "learning_rate": 2.9104997242590528e-08, + "loss": 0.6732, + "step": 1524 + }, + { + "epoch": 1.96, + "learning_rate": 2.5962300098008042e-08, + "loss": 0.6359, + "step": 1526 + }, + { + "epoch": 1.96, + "learning_rate": 2.2998883713326592e-08, + "loss": 0.6451, + "step": 1528 + }, + { + "epoch": 1.96, + "learning_rate": 2.0214801355192826e-08, + "loss": 0.6826, + "step": 1530 + }, + { + "epoch": 1.96, + "learning_rate": 1.761010306676969e-08, + "loss": 0.6993, + "step": 1532 + }, + { + "epoch": 1.97, + "learning_rate": 1.518483566683826e-08, + "loss": 0.6909, + "step": 1534 + }, + { + "epoch": 1.97, + "learning_rate": 1.2939042748955078e-08, + "loss": 0.6449, + "step": 1536 + }, + { + "epoch": 1.97, + "learning_rate": 1.0872764680671666e-08, + "loss": 0.7105, + "step": 1538 + }, + { + "epoch": 1.97, + "learning_rate": 8.986038602802894e-09, + "loss": 0.6682, + "step": 1540 + }, + { + "epoch": 1.98, + "learning_rate": 7.278898428764169e-09, + "loss": 0.6799, + "step": 1542 + }, + { + "epoch": 1.98, + "learning_rate": 5.751374843961932e-09, + "loss": 0.6296, + "step": 1544 + }, + { + "epoch": 1.98, + "learning_rate": 4.403495305237426e-09, + "loss": 0.6599, 
+ "step": 1546 + }, + { + "epoch": 1.98, + "learning_rate": 3.2352840403804264e-09, + "loss": 0.6654, + "step": 1548 + }, + { + "epoch": 1.99, + "learning_rate": 2.246762047685147e-09, + "loss": 0.6752, + "step": 1550 + }, + { + "epoch": 1.99, + "learning_rate": 1.437947095582759e-09, + "loss": 0.6753, + "step": 1552 + }, + { + "epoch": 1.99, + "learning_rate": 8.088537223116533e-10, + "loss": 0.6672, + "step": 1554 + }, + { + "epoch": 1.99, + "learning_rate": 3.594932356654202e-10, + "loss": 0.6977, + "step": 1556 + }, + { + "epoch": 2.0, + "learning_rate": 8.987371278079693e-11, + "loss": 0.6471, + "step": 1558 + }, + { + "epoch": 2.0, + "learning_rate": 0.0, + "loss": 0.6957, + "step": 1560 + }, + { + "epoch": 2.0, + "step": 1560, + "total_flos": 3.392417160138588e+18, + "train_loss": 0.928296754299066, + "train_runtime": 10205.5682, + "train_samples_per_second": 9.783, + "train_steps_per_second": 0.153 + } + ], + "logging_steps": 2, + "max_steps": 1560, + "num_train_epochs": 2, + "save_steps": 780, + "total_flos": 3.392417160138588e+18, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..b76e798 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a28f04ce9b5bd9db3a3ed3fd21daf4b59f2bf2a7d4739fce7bac0966e093e95d +size 4923