初始化项目，由ModelHub XC社区提供模型

Model: Finnish-NLP/Ahma-7B Source: Original Platform
2026-06-01 02:08:18 +08:00
commit be39ad8722
45 changed files with 297486 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,35 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pickle filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
 __pycache__
--- a/EasyLM/init.py
+++ b/EasyLM/init.py
--- a/EasyLM/bpt.py
+++ b/EasyLM/bpt.py
@@ -0,0 +1,228 @@
 """
 An implementation of Blockwise parallel transformer https://arxiv.org/abs/2305.19370
 Also includes a reference implementation of the memory-efficient transformer https://arxiv.org/abs/2112.05682
 """
 import functools
 from typing import NamedTuple
 import flax.linen as nn
 import jax
 import jax.lax as lax
 import jax.numpy as jnp
 from einops import rearrange
 """
 Computing ffn blockwise without materializing the large hidden tensor, training
 4x longer sequences than the memory-efficient transformer.
 Blockwise parallel transformer https://arxiv.org/abs/2305.19370 Liu et al. 2023
 """
 def blockwise_ffn(remat_ffn, inputs, chunk_size=2048, deterministic=True):
    """Run ``remat_ffn`` over ``inputs`` one slice at a time via ``nn.scan``.

    Args:
        remat_ffn: a rematerialized ffn module (e.g. wrapped with the policy
            ``jax.checkpoint_policies.nothing_saveable()``); it is called as
            ``remat_ffn(block, deterministic=deterministic)``.
        inputs: array laid out as (batch, seq_len, dim).
        chunk_size: length of the ``c`` axis the sequence axis is factored
            into; every scan step receives a (batch, chunk_size, dim) slice.
        deterministic: forwarded unchanged to ``remat_ffn``.

    Returns:
        The ffn outputs merged back into a (batch, seq_len, dim) layout.
    """
    def _ffn_step(module, carry, block):
        # The carry is unused (None); only the per-slice output matters.
        return carry, module(block, deterministic=deterministic)

    # Factor the sequence axis into (c, n) with c == chunk_size.
    blocked = rearrange(inputs, 'b (c n) d -> b c n d', c=chunk_size)
    # Scan along the second-to-last axis, i.e. the ``n`` axis.
    axis = blocked.ndim - 2
    scanned_ffn = nn.scan(
        _ffn_step,
        variable_broadcast="params",
        split_rngs={"params": False, "dropout": True},
        in_axes=axis,
        out_axes=axis,
    )
    _, blocked_out = scanned_ffn(remat_ffn, None, blocked)
    return rearrange(blocked_out, 'b c n d -> b (c n) d')
 """
 Compute attention blockwise without materializing the full attention matrix,
 initially proposed in memory-efficient transformer https://arxiv.org/abs/2112.05682 Rabe et al. 2021;
 flash attention https://arxiv.org/abs/2205.14135 Dao et al. 2022 proposes a CUDA
 efficient implementation; blockwise parallel transformer https://arxiv.org/abs/2305.19370
 Liu et al. 2023 proposes blockwise computing both attention and FFN, enabling 4x
 longer sequences than memory-efficient/flash-attention and fusion of attention and FFN.
 """
 def blockwise_attn(
        query, key, value,
        bias=None,
        deterministic=True,
        dropout_rng=None,
        attn_pdrop=0.0,
        causal=True,
        query_chunk_size=2048,
        key_chunk_size=2048,
        dtype=jnp.float32,
        policy=jax.checkpoint_policies.nothing_saveable(),
        precision=None,
        float32_logits=True,
        prevent_cse=True,
    ):
    """Softmax attention computed chunk by chunk with a running normalizer.

    Args:
        query, key, value: (batch, seq_len, num_heads, dim_per_head). The
            query and key/value sequence lengths are reshaped into
            ``len // chunk_size`` chunks with no remainder handling, so they
            are expected to be multiples of the respective chunk sizes.
        bias: optional additive attention bias. Each of its dimensions must be
            1 or equal to the matching entry of
            (batch, num_heads, q_len, kv_len); it is sliced with four indices
            in ``_chunk_attention_bias``.
        deterministic: when False and ``attn_pdrop > 0`` a dropout mask is
            sampled and applied through the bias.
        dropout_rng: PRNG key used to sample the dropout mask.
        attn_pdrop: probability passed to ``jax.random.bernoulli``.
        causal: whether to add a causal mask and skip key chunks that lie
            entirely after the current query chunk.
        query_chunk_size, key_chunk_size: chunk lengths along the sequence.
        dtype: dtype of the bias chunks and of the returned array.
        policy: one of jax.checkpoint_policies, applied to the per-key-chunk
            step through ``jax.checkpoint``.
        precision: forwarded to the ``jnp.einsum`` calls.
        float32_logits: cast query and key to float32 before the logits.
        prevent_cse: forwarded to ``jax.checkpoint``.

    Returns:
        Array of shape (batch, q_len, num_heads, dim_per_head) in ``dtype``.
    """
    # Fold the 1/sqrt(dim_per_head) scaling into the query once, up front.
    query = query / jnp.sqrt(query.shape[-1]).astype(dtype)
    if float32_logits:
        query = query.astype(jnp.float32)
        key = key.astype(jnp.float32)
    batch, q_len, num_heads, dim_per_head = query.shape
    batch, kv_len, num_heads, dim_per_head = key.shape
    batch, kv_len, num_heads, dim_per_head = value.shape
    num_q = q_len // query_chunk_size
    num_kv = kv_len // key_chunk_size
    # Split the sequence axis into (num_chunks, chunk_size) ...
    query = query.reshape((batch, num_q, query_chunk_size, num_heads, dim_per_head))
    key = key.reshape((batch, num_kv, key_chunk_size, num_heads, dim_per_head))
    value = value.reshape((batch, num_kv, key_chunk_size, num_heads, dim_per_head))
    # ... and move the chunk axis to the front so lax.scan iterates over it.
    query = jnp.moveaxis(query, 1, 0)
    key = jnp.moveaxis(key, 1, 0)
    value = jnp.moveaxis(value, 1, 0)
    if bias is not None:
        for bias_dim, broadcast_dim in zip(bias.shape, (batch, num_heads, q_len, kv_len)):
            assert bias_dim == 1 or bias_dim == broadcast_dim
    if not deterministic and attn_pdrop > 0.0:
        # Sample the whole dropout mask once; chunks of it are sliced later.
        attn_dropout_rng, dropout_rng = jax.random.split(dropout_rng)
        attn_dropout = jax.random.bernoulli(attn_dropout_rng, attn_pdrop, (batch, num_heads, q_len, kv_len))
    else:
        attn_dropout = None
    # Bind everything except the (query_chunk_idx, key_chunk_idx) pair.
    _chunk_bias_fn = functools.partial(
        _chunk_attention_bias,
        query_chunk_size, key_chunk_size, bias, deterministic,
        attn_dropout, attn_pdrop, causal, dtype)
    def scan_attention(args):
        # Attention output for one query chunk against every key/value chunk.
        query_chunk, query_chunk_idx = args
        @functools.partial(jax.checkpoint, prevent_cse=prevent_cse, policy=policy)
        def scan_kv_block(carry, args):
            # Fold one key/value chunk into the running softmax statistics.
            key_chunk, value_chunk, key_chunk_idx = args
            (numerator, denominator, prev_max_score) = carry
            attn_weights = jnp.einsum('bqhd,bkhd->bqhk', query_chunk, key_chunk, precision=precision)
            bias_chunk = _chunk_bias_fn(query_chunk_idx, key_chunk_idx)
            # Bias is laid out (b, h, q, k); swap to (b, q, h, k) like the logits.
            bias_chunk = jnp.moveaxis(bias_chunk, 1, 2)
            attn_weights = attn_weights + bias_chunk
            max_score = jnp.max(attn_weights, axis=-1, keepdims=True)
            max_score = jnp.maximum(prev_max_score, max_score)
            max_score = jax.lax.stop_gradient(max_score)
            exp_weights = jnp.exp(attn_weights - max_score)
            exp_values = jnp.einsum(
                'bqhv,bvhd->bqhd', exp_weights, value_chunk, precision=precision
            )
            # Rescale the previous accumulators to the new running maximum.
            correction = jnp.exp(prev_max_score - max_score)
            numerator = numerator * correction + exp_values
            denominator = denominator * correction + exp_weights.sum(axis=-1, keepdims=True)
            return Carry(numerator, denominator, max_score), None
        def skip_upper_half(carry, args):
            # With a causal mask, key chunks strictly after the query chunk are
            # skipped and the carry is passed through unchanged.
            key_chunk, value_chunk, key_chunk_idx = args
            skip_block = jnp.array(False)
            if causal:
                skip_block = query_chunk_idx < key_chunk_idx
            return jax.lax.cond(
                skip_block,
                lambda carry, args: (carry, None),
                scan_kv_block,
                carry,
                args,
            )
        # Zero accumulators and a running maximum of -inf.
        init_carry = Carry(
            jnp.zeros((batch, query_chunk_size, num_heads, dim_per_head), dtype=query.dtype),
            jnp.zeros((batch, query_chunk_size, num_heads, dim_per_head), dtype=query.dtype),
            (-jnp.inf) * jnp.ones((batch, query_chunk_size, num_heads, 1), dtype=query.dtype),
        )
        (numerator, denominator, max_score), _ = lax.scan(
            skip_upper_half, init_carry, xs=(key, value, jnp.arange(0, num_kv))
        )
        # Final normalization of the accumulated weighted values.
        outputs = (numerator / denominator).astype(dtype)
        return outputs
    # Outer scan over query chunks; it carries no state.
    _, res = lax.scan(
        lambda _, x: ((), scan_attention(x)),
        (), xs=(query, jnp.arange(0, num_q))
    )
    # Merge (num_q, chunk) back into a single sequence axis.
    res = rearrange(res, 'n b c h d -> b (n c) h d')
    return res
 class Carry(NamedTuple):
    """Running state threaded through the key/value scan in ``blockwise_attn``."""
    # Accumulated exp-weighted values; initialized there as zeros of shape
    # (batch, query_chunk_size, num_heads, dim_per_head).
    numerator: jax.Array
    # Accumulated sum of exp weights; initialized with the same shape as the
    # numerator and updated by broadcasting a (..., 1) row sum.
    denominator: jax.Array
    # Running maximum logit per query position; initialized to -inf with
    # shape (batch, query_chunk_size, num_heads, 1).
    max_so_far: jax.Array
 def _chunk_attention_bias(query_chunk_size, key_chunk_size,
            bias, deterministic, attn_dropout, attn_pdrop, causal,
            dtype, query_chunk_idx, key_chunk_idx):
    """Build the additive bias for one (query chunk, key chunk) pair.

    The result combines, in this order: the matching window of ``bias`` (or
    zeros of shape (1, 1, 1, 1) when ``bias`` is None), a causal mask when
    ``causal`` is set, and the matching window of ``attn_dropout`` when
    ``deterministic`` is False and ``attn_pdrop > 0``. Masked positions get
    ``jnp.finfo(dtype).min`` added. The result is cast to ``dtype``.
    """
    q_start = query_chunk_idx * query_chunk_size
    k_start = key_chunk_idx * key_chunk_size

    def _window(array):
        # Cut the window starting at (q_start, k_start) out of the last two
        # axes; an axis shorter than the chunk size is taken at its own length.
        rows = min(array.shape[-2], query_chunk_size)
        cols = min(array.shape[-1], key_chunk_size)
        return lax.dynamic_slice(
            array,
            start_indices=(0, 0, q_start, k_start),
            slice_sizes=(*array.shape[:2], rows, cols),
        )

    if bias is None:
        chunk_bias = jnp.zeros((1, 1, 1, 1), dtype=dtype)
    else:
        chunk_bias = _window(bias)

    if causal:
        q_pos = lax.broadcasted_iota(dtype=jnp.int32, shape=(query_chunk_size, 1), dimension=0)
        k_pos = lax.broadcasted_iota(dtype=jnp.int32, shape=(1, key_chunk_size), dimension=1)
        # Shift query positions so both index grids share the same origin.
        q_pos = q_pos + (q_start - k_start)
        causal_penalty = (q_pos < k_pos) * jnp.finfo(dtype).min
        chunk_bias = chunk_bias + causal_penalty.reshape(1, 1, *causal_penalty.shape)

    if not deterministic and attn_pdrop > 0.0:
        chunk_bias = chunk_bias + _window(attn_dropout) * jnp.finfo(dtype).min
    return chunk_bias.astype(dtype)
 if __name__ == '__main__':
    # Self-check: blockwise attention must agree with a dense reference.
    def reference_attn(query, key, value, causal, dtype):
        """Dense softmax attention used as the ground truth."""
        scaled_query = query / jnp.sqrt(query.shape[-1]).astype(dtype)
        scores = jnp.einsum("bqhc,bkhc->bhqk", scaled_query, key)
        if causal:
            grid = (scaled_query.shape[1], key.shape[1])
            # True where the key position lies after the query position.
            future = (
                jax.lax.broadcasted_iota(jnp.int32, grid, 0)
                < jax.lax.broadcasted_iota(jnp.int32, grid, 1)
            )[None, None, :, :]
            scores = scores + jnp.where(future, jnp.finfo(scores.dtype).min, 0.0)
        probs = jax.nn.softmax(scores, axis=-1)
        return jnp.einsum("bhqk,bkhc->bqhc", probs, value)

    # Random inputs drawn from PRNG keys 0, 1 and 2.
    shape = (1, 32, 8, 64)
    query, key, value = (
        jax.random.normal(jax.random.PRNGKey(seed), shape) for seed in range(3)
    )
    causal = True
    chunk_size = 4
    policy = jax.checkpoint_policies.nothing_saveable()
    blockwise = blockwise_attn(
        query, key, value,
        bias=None,
        deterministic=False,
        dropout_rng=None,
        attn_pdrop=0.0,
        causal=causal,
        query_chunk_size=chunk_size,
        key_chunk_size=chunk_size,
        dtype=jnp.float32,
        policy=policy,
        precision='float32',
        float32_logits=True,
        prevent_cse=False,
    )
    reference = reference_attn(query, key, value, causal, 'float32')
    assert jnp.allclose(reference, blockwise, atol=1e-6)
--- a/EasyLM/checkpoint.py
+++ b/EasyLM/checkpoint.py
@@ -0,0 +1,212 @@
 import os
 import numpy as np
 from ml_collections import ConfigDict
 import mlxu
 import jax
 import jax.numpy as jnp
 import flax
 from flax.serialization import (
    from_bytes, to_bytes, to_state_dict, from_state_dict
 )
 from flax.traverse_util import flatten_dict, unflatten_dict, empty_node
 import msgpack
 from EasyLM.jax_utils import tree_apply, float_tensor_to_dtype
 class StreamingCheckpointer(object):
    """ Custom msgpack checkpointer that saves large train states by serializing
        and saving tensors one by one in a streaming fashion. Avoids running
        out of memory or local TPU disk with default flax checkpointer.
    """
    @staticmethod
    def get_default_config(updates=None):
        """Return the default config, optionally overridden by ``updates``."""
        config = ConfigDict()
        config.float_dtype = 'bf16'
        config.save_optimizer_state = False
        if updates is not None:
            config.update(ConfigDict(updates).copy_and_resolve_references())
        return config
    def __init__(self, config, checkpoint_dir, enable=True):
        """Store the resolved config, the target directory and the enable flag."""
        self.config = self.get_default_config(config)
        self.checkpoint_dir = checkpoint_dir
        self.enable = enable
    def save_checkpoint(self, train_state, filename, gather_fns=None):
        """Stream ``train_state`` to ``filename`` inside the checkpoint dir.

        When the checkpointer is disabled the same serialization still runs
        but is written to ``/dev/null``.
        """
        if self.enable:
            path = os.path.join(self.checkpoint_dir, filename)
        else:
            path = '/dev/null'
        self.save_train_state_to_file(
            train_state, path, gather_fns, self.config.float_dtype
        )
    @staticmethod
    def save_train_state_to_file(train_state, path, gather_fns=None, float_dtype=None):
        """Write the flattened state to ``path`` as a stream of msgpack records.

        Each record is a ``(key, bytes)`` pair, where ``key`` is the flattened
        tree path and ``bytes`` is the ``to_bytes`` serialization of the value
        after the optional gather function and the float dtype conversion.
        """
        train_state = to_state_dict(train_state)
        packer = msgpack.Packer()
        flattend_train_state = flatten_dict(train_state)
        if gather_fns is not None:
            gather_fns = flatten_dict(to_state_dict(gather_fns))
        with mlxu.open_file(path, "wb") as fout:
            for key, value in flattend_train_state.items():
                if gather_fns is not None:
                    value = gather_fns[key](value)
                value = float_tensor_to_dtype(value, float_dtype)
                fout.write(packer.pack((key, to_bytes(value))))
    def save_pickle(self, obj, filename):
        """Pickle ``obj`` into the checkpoint dir (``/dev/null`` when disabled)."""
        if self.enable:
            path = os.path.join(self.checkpoint_dir, filename)
        else:
            path = '/dev/null'
        mlxu.save_pickle(obj, path)
    def save_all(self, train_state, gather_fns, metadata=None, dataset=None, milestone=False):
        """Save metadata, dataset state and the model state for the current step.

        With ``config.save_optimizer_state`` the whole train state is written
        as ``streaming_train_state``; otherwise only ``params['params']`` is
        written as ``streaming_params``. Milestone saves append ``_{step}`` to
        every file name so they are not overwritten by later saves.
        """
        step = int(jax.device_get(train_state.step))
        if self.config.save_optimizer_state:
            checkpoint_state = train_state
            checkpoint_name = 'streaming_train_state'
            checkpoint_gather_fns = gather_fns
        else:
            checkpoint_state = train_state.params['params']
            checkpoint_name = 'streaming_params'
            checkpoint_gather_fns = gather_fns.params['params']
        if milestone:
            # Save a milestone checkpoint that will not be overwritten
            self.save_pickle(metadata, f'metadata_{step}.pkl')
            self.save_pickle(dataset, f'dataset_{step}.pkl')
            self.save_checkpoint(
                checkpoint_state, f'{checkpoint_name}_{step}', checkpoint_gather_fns
            )
        else:
            # Save a normal checkpoint that can be overwritten
            self.save_pickle(metadata, 'metadata.pkl')
            self.save_pickle(dataset, 'dataset.pkl')
            self.save_checkpoint(
                checkpoint_state, f'{checkpoint_name}', checkpoint_gather_fns
            )
    @staticmethod
    def load_checkpoint(path, target=None, shard_fns=None, remove_dict_prefix=None):
        """Load a checkpoint written in the streaming msgpack format.

        Args:
            path: file to read.
            target: optional pytree; when given, the loaded state is restored
                into it with ``from_state_dict`` and its empty nodes are
                filled in if missing from the file.
            shard_fns: optional pytree of callables applied to each tensor,
                looked up by flattened key.
            remove_dict_prefix: optional key prefix; records without it are
                skipped and the prefix is stripped from the remaining keys.

        Returns:
            The unflattened state dict, or ``target`` populated from it.
        """
        if shard_fns is not None:
            shard_fns = flatten_dict(
                to_state_dict(shard_fns)
            )
        if remove_dict_prefix is not None:
            remove_dict_prefix = tuple(remove_dict_prefix)
        flattend_train_state = {}
        with mlxu.open_file(path) as fin:
            # 83886080 bytes = 80 MB, which is 16 blocks on GCS
            unpacker = msgpack.Unpacker(fin, read_size=83886080, max_buffer_size=0)
            for key, value in unpacker:
                key = tuple(key)
                if remove_dict_prefix is not None:
                    if key[:len(remove_dict_prefix)] == remove_dict_prefix:
                        key = key[len(remove_dict_prefix):]
                    else:
                        continue
                tensor = from_bytes(None, value)
                if shard_fns is not None:
                    tensor = shard_fns[key](tensor)
                flattend_train_state[key] = tensor
        if target is not None:
            flattened_target = flatten_dict(
                to_state_dict(target), keep_empty_nodes=True
            )
            for key, value in flattened_target.items():
                if key not in flattend_train_state and value == empty_node:
                    flattend_train_state[key] = value
        train_state = unflatten_dict(flattend_train_state)
        if target is None:
            return train_state
        return from_state_dict(target, train_state)
    @staticmethod
    def load_flax_checkpoint(path, target=None, shard_fns=None):
        """ Load a standard flax checkpoint that's not saved with the
            msgpack streaming format.
        """
        with mlxu.open_file(path, "rb") as fin:
            encoded_bytes = fin.read()
        state_dict = flax.serialization.msgpack_restore(encoded_bytes)
        if shard_fns is not None:
            shard_fns = to_state_dict(shard_fns)
            state_dict = tree_apply(shard_fns, state_dict)
        if target is None:
            return state_dict
        return from_state_dict(target, state_dict)
    @classmethod
    def load_trainstate_checkpoint(cls, load_from, trainstate_target=None,
                                   trainstate_shard_fns=None,
                                   disallow_trainstate=False):
        """Load a train state or params according to a ``type::path`` spec.

        Args:
            load_from: string of the form ``<type>::<path>`` where ``<type>``
                is one of ``trainstate``, ``trainstate_params``, ``params`` or
                ``flax_params``.
            trainstate_target: optional train state used as restore target.
            trainstate_shard_fns: optional shard functions matching the target.
            disallow_trainstate: assert that ``<type>`` is not ``trainstate``.

        Returns:
            ``(train_state, restored_params)``. Exactly one of the two is not
            None: ``train_state`` for the ``trainstate`` type, otherwise
            ``restored_params`` frozen as ``{'params': ...}``.

        Raises:
            ValueError: if ``load_from`` has no ``::`` separator or names an
                unknown load type.
        """
        if trainstate_target is not None:
            params_target = trainstate_target.params['params']
        else:
            params_target = None
        if trainstate_shard_fns is not None:
            params_shard_fns = trainstate_shard_fns.params['params']
        else:
            params_shard_fns = None
        # partition() lets a missing separator be reported explicitly instead
        # of surfacing as a tuple-unpacking error.
        load_type, separator, load_path = load_from.partition('::')
        if not separator:
            raise ValueError(
                f'Invalid load_from value: {load_from!r}, '
                f'expected the form "<type>::<path>"'
            )
        if disallow_trainstate:
            assert load_type != 'trainstate', 'Loading full trainstate is not allowed!'
        train_state = None
        restored_params = None
        if load_type == 'trainstate':
            # Load the entire train state in the streaming format
            train_state = cls.load_checkpoint(
                path=load_path,
                target=trainstate_target,
                shard_fns=trainstate_shard_fns,
            )
        elif load_type == 'trainstate_params':
            # Load the params part of the train state in the streaming format
            restored_params = cls.load_checkpoint(
                path=load_path,
                target=params_target,
                shard_fns=params_shard_fns,
                remove_dict_prefix=('params', 'params'),
            )
        elif load_type == 'params':
            # Load the params in the streaming format
            restored_params = cls.load_checkpoint(
                path=load_path,
                target=params_target,
                shard_fns=params_shard_fns,
            )
        elif load_type == 'flax_params':
            # Load the params in the standard flax format (non-streaming)
            # This requires the entire params to fit in memory
            restored_params = cls.load_flax_checkpoint(
                path=load_path,
                target=params_target,
                shard_fns=params_shard_fns
            )
        else:
            raise ValueError(f'Invalid load_from type: {load_type}')
        if load_type != 'trainstate':
            # Every params-only branch returns the same frozen wrapper.
            restored_params = flax.core.frozen_dict.freeze(
                {'params': restored_params}
            )
        return train_state, restored_params
--- a/EasyLM/data.py
+++ b/EasyLM/data.py
@@ -0,0 +1,436 @@
 import dataclasses
 import pprint
 import time
 from functools import partial
 import json
 import base64
 from multiprocessing import Pool
 import h5py
 import mlxu
 from ml_collections.config_dict import config_dict
 from ml_collections import ConfigDict
 from tqdm import tqdm, trange
 import numpy as np
 from datasets import load_dataset, load_from_disk
 class DatasetFactory(object):
    """ Dataset builder class. """
    @staticmethod
    def get_default_config(updates=None):
        """Return the default factory config, optionally overridden by ``updates``."""
        config = ConfigDict()
        config.type = 'huggingface'
        config.text_processor = TextProcessor.get_default_config()
        config.huggingface_dataset = HuggingfaceDataset.get_default_config()
        config.json_dataset = JsonDataset.get_default_config()
        if updates is None:
            return config
        config.update(ConfigDict(updates).copy_and_resolve_references())
        return config
    @classmethod
    def load_dataset(cls, config, tokenizer, **kwargs):
        """Build the dataset selected by ``config.type``.

        Extra keyword arguments are forwarded to the dataset constructor.
        Raises ValueError for an unknown dataset type.
        """
        config = cls.get_default_config(config)
        # The text processor is built before dispatching, for every type.
        text_processor = TextProcessor(config.text_processor, tokenizer)
        dataset_type = config.type
        if dataset_type == 'huggingface':
            return HuggingfaceDataset(
                config.huggingface_dataset, tokenizer, text_processor, **kwargs
            )
        if dataset_type == 'json':
            return JsonDataset(config.json_dataset, tokenizer, text_processor, **kwargs)
        raise ValueError(f'Unknown dataset type: {config.type}')
    def __init__(self):
        """Always raises: the factory is only used through its class methods."""
        raise ValueError('DatasetFactory is a static class and should not be instantiated.')
 class TextProcessor(object):
    """ Example processor that converts a dictionary of texts into tokens. """
    @staticmethod
    def get_default_config(updates=None):
        """Return the default processor config, optionally overridden by ``updates``."""
        config = ConfigDict()
        config.fields_from_example = ''
        config.fields = ''
        config.subfield_separator = ' '
        config.add_bos_token = True
        config.add_eos_token = True
        config.prepend_text = ''
        config.base64_token_dtype = 'i4'
        if updates is None:
            return config
        config.update(ConfigDict(updates).copy_and_resolve_references())
        return config
    def __init__(self, config, tokenizer):
        """Resolve the config and keep the tokenizer; a field spec is required."""
        self.config = self.get_default_config(config)
        assert self.config.fields != '' or self.config.fields_from_example != '', (
            'Either fields or fields_from_example must be specified.'
        )
        self.tokenizer = tokenizer
    def __call__(self, example, has_aux=False):
        """Tokenize one example into parallel token and loss-mask lists.

        The comma-separated field spec comes from the example key named by
        ``config.fields_from_example`` when that is set, otherwise from
        ``config.fields``. Each field can be:
          * ``[field]``  - same as ``field`` but with loss mask 0.0;
          * ``<|bos|>``, ``<|eos|>`` or ``<|N|>`` - a single special token
            (``N`` is a literal token id);
          * ``{key}``    - base64 encoded raw tokens stored under ``key``;
          * ``a+b``      - text of several example keys joined with
            ``config.subfield_separator`` and encoded by the tokenizer.

        With ``has_aux`` the input is ``(example, *aux)`` and the aux items
        are appended to the returned tuple.
        """
        aux = tuple()
        if has_aux:
            example, *aux = example
        token_buffer = []
        loss_mask_buffer = []
        if self.config.add_bos_token:
            token_buffer.append(self.tokenizer.bos_token_id)
            loss_mask_buffer.append(0.0)
        if self.config.fields_from_example == '':
            field_specs = self.config.fields.split(',')
        else:
            field_specs = example[self.config.fields_from_example].split(',')
        for position, spec in enumerate(field_specs):
            # Square brackets switch the loss off for this field.
            unweighted = spec.startswith('[') and spec.endswith(']')
            if unweighted:
                spec = spec[1:-1]
            weight = 0.0 if unweighted else 1.0
            if spec.startswith('<|') and spec.endswith('|>'):
                # Special tokens.
                special = spec[2:-2]
                if special == 'bos':
                    new_tokens = [self.tokenizer.bos_token_id]
                elif special == 'eos':
                    new_tokens = [self.tokenizer.eos_token_id]
                else:
                    # Token ID specified directly.
                    new_tokens = [int(special)]
            elif spec.startswith('{') and spec.endswith('}'):
                # Base64 encoded raw tokens.
                raw = base64.b64decode(example[spec[1:-1]])
                new_tokens = np.frombuffer(
                    raw, dtype=self.config.base64_token_dtype
                ).tolist()
            else:
                pieces = [example[subfield] for subfield in spec.split('+')]
                text = self.config.subfield_separator.join(pieces)
                if position == 0:
                    text = self.config.prepend_text + text
                new_tokens = self.tokenizer.encode(text)
            token_buffer.extend(new_tokens)
            loss_mask_buffer.extend([weight] * len(new_tokens))
        if self.config.add_eos_token:
            token_buffer.append(self.tokenizer.eos_token_id)
            loss_mask_buffer.append(1.0)
        return token_buffer, loss_mask_buffer, *aux
 class HuggingfaceDataset(object):
    """ Huggingface dataset that yields fixed-size token batches.

        In this version the data is read with datasets.load_from_disk() from
        ``config.path`` and the configured split is converted to an iterable
        dataset.
    """
    @staticmethod
    def get_default_config(updates=None):
        """Return the default dataset config, optionally overridden by ``updates``."""
        config = ConfigDict()
        config.path = 'c4'
        config.name = 'en'
        config.split = 'train'
        config.streaming = False
        config.seq_length = 1024
        config.batch_size = 8
        config.always_start_with_bos = False
        config.start_seek_loc = 0
        config.tokens_count_at_start = 0
        config.batch_token_dtype = 'i4'
        config.reset_dataset_loc = False
        if updates is not None:
            config.update(ConfigDict(updates).copy_and_resolve_references())
        return config
    def __init__(self, config, tokenizer, text_processor, eval_dataset=False):
        """Load the split from disk and initialize the iteration state.

        Args:
            config: overrides for ``get_default_config``.
            tokenizer: tokenizer exposed through the ``tokenizer`` property.
            text_processor: callable turning an example into
                ``(tokens, loss_masks)``.
            eval_dataset: when True, iteration is a single unshuffled pass.
        """
        self.config = self.get_default_config(config)
        # NOTE(review): ``name`` is computed but not used anywhere below.
        name = self.config.name if self.config.name != '' else None
        # An empty split string becomes None, which is then used as the
        # lookup key on the loaded dataset.
        split = self.config.split if self.config.split != '' else None
        self._tokenizer = tokenizer
        self._text_processor = text_processor
        self._dataset = load_from_disk(
            self.config.path
        )[split]
        # Use 128 shards, or one shard per example for smaller datasets.
        self._dataset = self._dataset.to_iterable_dataset(num_shards=128 if len(self._dataset) > 128 else len(self._dataset))
        self._eval_dataset = eval_dataset
        self._train_epochs = 0
        self._dataset_loc = self.config.start_seek_loc
        self._total_tokens = self.config.tokens_count_at_start
        self._index = 0
        self.reset_dataset_loc = self.config.reset_dataset_loc
    def __iter__(self):
        """Yield ``(batch, metrics)`` pairs of packed token sequences.

        ``batch`` holds ``input_tokens``, ``target_tokens`` (inputs shifted by
        one token) and ``loss_masks``, each reshaped to
        ``(batch_size, seq_length)``. Training iteration loops over epochs
        without end; evaluation stops after one pass.
        """
        # Resuming past the first epoch: shuffle right away.
        if not self._eval_dataset and self._train_epochs > 0:
            self._dataset = self._dataset.shuffle(seed=42, buffer_size=10000)
        chunk_size = self.config.batch_size * self.config.seq_length
        while True:
            # Leftover tokens are dropped at the start of every pass.
            token_buffer = []
            loss_mask_buffer = []
            if not self._eval_dataset and self._train_epochs > 0:
                self._dataset.set_epoch(self._train_epochs)
            for index, example in enumerate(self._dataset):
                self._index = index
                # Training only: skip examples before the restored location.
                if not self._eval_dataset and self._dataset_loc > index:
                    continue
                tokens, loss_masks = self.text_processor(example)
                token_buffer.extend(tokens)
                loss_mask_buffer.extend(loss_masks)
                # Emit batches while the buffer holds more than chunk_size + 1
                # tokens (the extra token is needed for the shifted targets).
                while len(token_buffer) > chunk_size + 1:
                    self._total_tokens += chunk_size
                    metrics = {
                        'dataset_example_index': index,
                        'dataset_total_tokens': self._total_tokens,
                        'epoch': self._train_epochs,
                    }
                    batch = {
                        'input_tokens': np.array(token_buffer[:chunk_size], dtype=self.config.batch_token_dtype).reshape(
                            self.config.batch_size, -1
                        ),
                        'target_tokens': np.array(token_buffer[1:chunk_size + 1], dtype=self.config.batch_token_dtype).reshape(
                            self.config.batch_size, -1
                        ),
                        'loss_masks': np.array(loss_mask_buffer[1:chunk_size + 1], dtype=np.float32).reshape(
                            self.config.batch_size, -1
                        ),
                    }
                    if self.config.always_start_with_bos:
                        # Overwrite the first input token of every row.
                        batch['input_tokens'][:, 0] = self.tokenizer.bos_token_id
                    yield batch, metrics
                    # Advance by chunk_size so the last target token becomes
                    # the first input token of the next batch.
                    token_buffer = token_buffer[chunk_size:]
                    loss_mask_buffer = loss_mask_buffer[chunk_size:]
            if self._eval_dataset:
                break
            else:
                # End of a training pass: enable shuffling after the first
                # epoch, then restart from the beginning of the dataset.
                if self._train_epochs == 0:
                    self._dataset = self._dataset.shuffle(seed=42, buffer_size=10000)
                self._dataset_loc = 0
                self._train_epochs += 1
    def get_state_dict(self):
        """Return the config and the current position for checkpointing."""
        return dict(
            config=self.config,
            dataset_loc=self._index,
            total_tokens=self._total_tokens,
            epochs=self._train_epochs,
        )
    def load_state_dict(self, state_dict):
        """Restore config and position saved by ``get_state_dict``.

        Missing entries fall back to the configured start values. When
        ``reset_dataset_loc`` was set at construction, the location and the
        epoch counter are reset to 0 after loading.
        """
        if 'config' in state_dict:
            self.config.update(ConfigDict(state_dict['config']))
        self._dataset_loc = state_dict.get('dataset_loc', self.config.start_seek_loc)
        self._total_tokens = state_dict.get('total_tokens', self.config.tokens_count_at_start)
        self._train_epochs = state_dict.get('epochs', 0)
        if self.reset_dataset_loc:
            self._dataset_loc = 0
            self._train_epochs = 0
    @property
    def seq_length(self):
        """Configured sequence length."""
        return self.config.seq_length
    @property
    def tokenizer(self):
        """Tokenizer passed to the constructor."""
        return self._tokenizer
    @property
    def text_processor(self):
        """Text processor passed to the constructor."""
        return self._text_processor
    @property
    def dataset(self):
        """Underlying iterable dataset."""
        return self._dataset
    @property
    def vocab_size(self):
        """Vocabulary size, taken as ``len(tokenizer)``."""
        return len(self._tokenizer)
 class JsonDataset(object):
    """ JSON dataset, where each line of the data file contains a JSON
        dictionary with text fields.
    """
    @staticmethod
    def get_default_config(updates=None):
        """Return the default JSON dataset config, optionally overridden by ``updates``."""
        defaults = (
            ('path', ''),
            ('seq_length', 1024),
            ('batch_size', 8),
            ('always_start_with_bos', False),
            ('start_seek_loc', 0),
            ('example_index_at_start', 0),
            ('tokens_count_at_start', 0),
            ('tokenizer_processes', 1),
            ('tokenizer_parallel_chunk_size', 32),
            ('tokenizer_parallel_batch_size', 1024),
            ('throughput_average_window_size', 200),
        )
        config = ConfigDict()
        # Assign in declaration order, exactly like attribute assignment.
        for field_name, field_value in defaults:
            setattr(config, field_name, field_value)
        if updates is None:
            return config
        config.update(ConfigDict(updates).copy_and_resolve_references())
        return config
    def __init__(self, config, tokenizer, text_processor):
        """Resolve the config and initialize the read position counters.

        ``config.path`` must be non-empty. The example index, file offset and
        token count start from the ``*_at_start`` / ``start_seek_loc`` config
        values.
        """
        self.config = self.get_default_config(config)
        assert self.config.path != ''
        self._tokenizer = tokenizer
        self._text_processor = text_processor
        self._index = self.config.example_index_at_start
        self._file_loc = self.config.start_seek_loc
        self._total_tokens = self.config.tokens_count_at_start
    def parse_json(self, line):
        """Decode one line of JSON.

        Returns the decoded value, or None for an empty line, a bare newline,
        or a line that fails to parse (the failure is printed).
        """
        is_blank = (not line) or line == '\n'
        if is_blank:
            return None
        try:
            return json.loads(line)
        except json.decoder.JSONDecodeError:
            print(f'Error parsing json line:\n{line}')
            return None
    def json_iterator(self):
        """Yield ``(data, file_loc, index)`` for each parsable line, forever.

        Reading starts at ``self._file_loc`` and wraps to the start of the
        file at EOF. ``file_loc`` is the position right after the yielded line
        and ``index`` counts lines read since the last wrap.

        NOTE(review): if the file contains no parsable line (e.g. it is
        empty), this loop keeps wrapping and never yields.
        """
        with mlxu.open_file(self.config.path, 'r') as fin:
            fin.seek(self._file_loc)
            while True:
                line = fin.readline()
                # Record the offset after the read so a resume starts at the
                # next line.
                self._file_loc = fin.tell()
                if not line:   # Reached EOF
                    self._index = 0
                    fin.seek(0)
                    continue
                data = self.parse_json(line)
                if data is not None:
                    # JSON parsing succeeded
                    yield data, self._file_loc, self._index
                # Blank and unparsable lines still advance the index.
                self._index += 1
    def batched(self, iterator, batch_size):
        """Group items from ``iterator`` into lists of ``batch_size`` items.

        A final shorter list is yielded when items remain after the iterator
        is exhausted.
        """
        pending = []
        for item in iterator:
            pending.append(item)
            if len(pending) != batch_size:
                continue
            yield pending
            pending = []
        if pending:
            yield pending
    def parallel_example_iterator(self):
        """ Yield text-processor outputs for every example of `json_iterator`.

            With a single tokenizer process the examples are processed inline.
            Otherwise a process pool maps the text processor over batches of
            examples, always keeping one batch in flight while the previous
            one is being yielded.
        """
        if self.config.tokenizer_processes == 1:
            for record, file_loc, example_index in self.json_iterator():
                yield self.text_processor(
                    (record, file_loc, example_index), has_aux=True
                )
            return

        worker_pool = Pool(self.config.tokenizer_processes)
        batches = self.batched(
            self.json_iterator(), self.config.tokenizer_parallel_batch_size
        )
        with worker_pool as pool:
            process = partial(self.text_processor, has_aux=True)

            def submit():
                # Schedule the next batch of raw examples on the pool.
                return pool.map_async(
                    process, next(batches),
                    chunksize=self.config.tokenizer_parallel_chunk_size
                )

            in_flight = submit()
            while True:
                # Queue the following batch before draining the finished one.
                finished, in_flight = in_flight, submit()
                for processed in finished.get():
                    yield processed
    def __iter__(self):
        """ Yield `(batch, metrics)` pairs packed from the processed examples.

            Tokens and loss masks from `parallel_example_iterator` are appended
            to running buffers. Whenever more than `batch_size * seq_length + 1`
            tokens are buffered, one batch is cut from the front:

            * `input_tokens`: the first `chunk_size` tokens,
            * `target_tokens` / `loss_masks`: the same window shifted by one,

            each reshaped to `(batch_size, -1)`. `metrics` reports the file
            offset and example index of the most recently consumed example,
            the running token count and two throughput estimates
            (tokens per second since iteration started, and averaged over the
            last `throughput_average_window_size` steps).
        """
        chunk_size = self.config.batch_size * self.config.seq_length
        token_buffer = []
        loss_mask_buffer = []
        step_times = []
        start_time = time.time()
        # Measure the first step from the start of iteration. Seeding this
        # with 0.0 would record the whole Unix-epoch offset as the first step
        # duration and drag the windowed throughput towards zero.
        last_time = start_time
        start_tokens = self._total_tokens
        for tokens, loss_masks, loc, index in self.parallel_example_iterator():
            token_buffer.extend(tokens)
            loss_mask_buffer.extend(loss_masks)
            while len(token_buffer) > chunk_size + 1:
                self._total_tokens += chunk_size
                now = time.time()
                step_times.append(now - last_time)
                last_time = now
                if len(step_times) > self.config.throughput_average_window_size:
                    step_times = step_times[-self.config.throughput_average_window_size:]
                # Clamp the denominators so a zero timer delta (coarse clocks)
                # cannot raise ZeroDivisionError or produce inf.
                average_throughput = chunk_size / max(np.mean(step_times), 1e-10)
                accumulated_throughput = (
                    (self._total_tokens - start_tokens) / max(now - start_time, 1e-10)
                )
                metrics = {
                    'dataset_file_loc': loc,
                    'dataset_example_index': index,
                    'dataset_total_tokens': self._total_tokens,
                    'dataset_accumulated_tps': accumulated_throughput,
                    'dataset_average_tps': average_throughput,
                }
                batch = {
                    'input_tokens': np.array(token_buffer[:chunk_size], dtype=np.int32).reshape(
                        self.config.batch_size, -1
                    ),
                    'target_tokens': np.array(token_buffer[1:chunk_size + 1], dtype=np.int32).reshape(
                        self.config.batch_size, -1
                    ),
                    'loss_masks': np.array(loss_mask_buffer[1:chunk_size + 1], dtype=np.float32).reshape(
                        self.config.batch_size, -1
                    ),
                }
                if self.config.always_start_with_bos:
                    # Overwrite the first token of every row with BOS.
                    batch['input_tokens'][:, 0] = self.tokenizer.bos_token_id
                yield batch, metrics
                # Keep the overlap token: the next batch's inputs start at
                # the last target of this one.
                token_buffer = token_buffer[chunk_size:]
                loss_mask_buffer = loss_mask_buffer[chunk_size:]
    def get_state_dict(self):
        """ Return the config and the current read position as a plain dict. """
        snapshot = {
            'config': self.config,
            'index': self._index,
            'file_loc': self._file_loc,
            'total_tokens': self._total_tokens,
        }
        return snapshot
    def load_state_dict(self, state_dict):
        """ Restore the read position (and optionally the config) from a dict.

            Keys missing from `state_dict` fall back to the corresponding
            `*_at_start` / `start_seek_loc` values of the (possibly updated)
            config.
        """
        if 'config' in state_dict:
            self.config.update(ConfigDict(state_dict['config']))
        restore_plan = (
            ('_index', 'index', 'example_index_at_start'),
            ('_file_loc', 'file_loc', 'start_seek_loc'),
            ('_total_tokens', 'total_tokens', 'tokens_count_at_start'),
        )
        for attr, key, fallback_field in restore_plan:
            fallback = getattr(self.config, fallback_field)
            setattr(self, attr, state_dict.get(key, fallback))
    @property
    def seq_length(self):
        """ Sequence length taken from the dataset config. """
        length = self.config.seq_length
        return length
    @property
    def tokenizer(self):
        """ The tokenizer object passed to the constructor. """
        tok = self._tokenizer
        return tok
    @property
    def text_processor(self):
        """ The text processor callable passed to the constructor. """
        processor = self._text_processor
        return processor
    @property
    def vocab_size(self):
        """ Vocabulary size, computed as `len()` of the tokenizer. """
        size = len(self.tokenizer)
        return size
--- a/EasyLM/jax_utils.py
+++ b/EasyLM/jax_utils.py
@@ -0,0 +1,403 @@
 import os
 import math
 from typing import Any, Mapping, Text, Tuple, Union, NamedTuple
 from functools import partial
 import re
 import dataclasses
 import random
 from ml_collections import ConfigDict
 from ml_collections.config_dict.config_dict import placeholder
 import flax
 import jax
 import jax.numpy as jnp
 from jax.sharding import PartitionSpec as PS
 from jax.sharding import Mesh
 from jax.experimental import mesh_utils
 from jax.experimental.pjit import with_sharding_constraint as _with_sharding_constraint
 from jax.experimental.pjit import pjit
 from jax.interpreters import pxla
 import numpy as np
 from transformers import FlaxLogitsWarper
 class JaxRNG(object):
    """ Stateful convenience wrapper around a JAX PRNG key.

        Every call splits the stored key, keeps the first piece as the new
        internal state and hands out the remaining piece(s), so the wrapper
        can be used to thread randomness through a pure function.
    """
    def __init__(self, rng):
        self.rng = rng
    @classmethod
    def from_seed(cls, seed):
        """ Build a wrapper from an integer seed. """
        initial_key = jax.random.PRNGKey(seed)
        return cls(initial_key)
    def __call__(self, keys=None):
        """ Return fresh key(s) and advance the internal state.

            * `keys is None`: a single key.
            * `keys` is an int: a tuple with that many keys.
            * otherwise: a dict mapping each element of `keys` to its own key.
        """
        if keys is None:
            next_state, fresh = jax.random.split(self.rng)
            self.rng = next_state
            return fresh
        wants_tuple = isinstance(keys, int)
        n_keys = keys if wants_tuple else len(keys)
        pieces = jax.random.split(self.rng, num=n_keys + 1)
        self.rng = pieces[0]
        fresh = pieces[1:]
        if wants_tuple:
            return tuple(fresh)
        return dict(zip(keys, fresh))
 class JaxDistributedConfig(object):
    """ Helper that builds the config for, and optionally performs, the
        `jax.distributed` initialization.
    """
    @staticmethod
    def get_default_config(updates=None):
        """ Return the default config, optionally overridden by `updates`. """
        defaults = ConfigDict()
        defaults.initialize_jax_distributed = False
        # Typed placeholders: these read as None until a value is supplied.
        typed_fields = (
            ('coordinator_address', str),
            ('num_processes', int),
            ('process_id', int),
            ('local_device_ids', str),
        )
        for field_name, field_type in typed_fields:
            setattr(defaults, field_name, placeholder(field_type))
        if updates is not None:
            overrides = ConfigDict(updates).copy_and_resolve_references()
            defaults.update(overrides)
        return defaults
    @classmethod
    def initialize(cls, config):
        """ Call `jax.distributed.initialize` when the config asks for it.

            `local_device_ids` is given as a comma-separated string and is
            converted to a list of ints before being passed on.
        """
        config = cls.get_default_config(config)
        if not config.initialize_jax_distributed:
            return
        if config.local_device_ids is None:
            device_ids = None
        else:
            device_ids = [int(x) for x in config.local_device_ids.split(',')]
        jax.distributed.initialize(
            coordinator_address=config.coordinator_address,
            num_processes=config.num_processes,
            process_id=config.process_id,
            local_device_ids=device_ids,
        )
 class FlaxTemperatureLogitsWarper(FlaxLogitsWarper):
    """ JIT traceable version of FlaxLogitsWarper that performs temperature scaling.

        `temperature` is stored as given and may be a traced value; it is
        bounded below by 1e-8 before dividing so a zero temperature does not
        divide by zero.
    """
    def __init__(self, temperature):
        self.temperature = temperature
    def __call__(self, input_ids, scores, cur_len):
        """ Return `scores` divided by the lower-bounded temperature.

            `input_ids` and `cur_len` are accepted for interface compatibility
            and are not used.
        """
        # jnp.maximum is the lower-bound-only form of clip; it avoids the
        # deprecated `a_min` keyword of jnp.clip.
        return scores / jnp.maximum(self.temperature, 1e-8)
 def make_shard_and_gather_fns(partition_specs, dtype_specs=None):
    """ Build pytrees of shard and gather functions from a pytree of
        partition specs.

        `dtype_specs` may be None (no dtype conversion), a single float dtype
        (every float tensor is cast to it), or a pytree matching
        `partition_specs` whose leaves expose a `.dtype` to cast to.
        Returns `(shard_fns, gather_fns)`.
    """
    float_dtypes = (jnp.bfloat16, jnp.float16, jnp.float32, jnp.float64)

    def make_to_dtype_fn(dtype_spec):
        def to_dtype(tensor):
            tensor_dtype = getattr(tensor, 'dtype', None)
            if dtype_specs in float_dtypes and tensor_dtype in float_dtypes:
                # Convert all float tensors to the same dtype
                return tensor.astype(dtype_specs)
            if hasattr(dtype_spec, 'dtype') and hasattr(tensor, 'dtype'):
                return tensor.astype(dtype_spec.dtype)
            return tensor
        return to_dtype

    def make_shard_fn(partition_spec, dtype_spec=None):
        converter = make_to_dtype_fn(dtype_spec)
        jax_shard_function = pjit(
            converter, in_shardings=None, out_shardings=partition_spec
        )
        def shard_fn(tensor):
            sharded = jax_shard_function(tensor)
            return sharded.block_until_ready()
        return shard_fn

    def make_gather_fn(partition_spec, dtype_spec=None):
        converter = make_to_dtype_fn(dtype_spec)
        jax_gather_fn = pjit(
            converter, in_shardings=partition_spec, out_shardings=None
        )
        def gather_fn(tensor):
            gathered = jax_gather_fn(tensor)
            return jax.device_get(gathered)
        return gather_fn

    # A per-leaf dtype tree is only mapped alongside the partition specs when
    # dtype_specs is neither None nor a single float dtype.
    single_dtype = dtype_specs is None or dtype_specs in float_dtypes
    extra_trees = () if single_dtype else (dtype_specs,)
    shard_fns = jax.tree_util.tree_map(
        make_shard_fn, partition_specs, *extra_trees
    )
    gather_fns = jax.tree_util.tree_map(
        make_gather_fn, partition_specs, *extra_trees
    )
    return shard_fns, gather_fns
 def set_random_seed(seed):
    """ Seed NumPy, Python's `random` module and the module-level JAX RNG. """
    for seeder in (np.random.seed, random.seed, init_rng):
        seeder(seed)
 def get_jax_mesh(axis_dims, names):
    """ Build a `Mesh` over all devices from a textual axis specification.

        `axis_dims` is either a comma-separated list of sizes matching
        `names` positionally (e.g. `'1,-1,1'`) or a list of `name:size` pairs
        (e.g. `'dp:1,fsdp:-1'`) that must cover exactly the given `names`.
        A leading `'!'` makes the mesh a plain reshape of `jax.devices()`
        instead of going through `mesh_utils.create_device_mesh`.
    """
    mesh_axis_splitting = axis_dims.startswith('!')
    if mesh_axis_splitting:
        # Allow splitting a physical mesh axis if needed
        axis_dims = axis_dims[1:]

    if ':' not in axis_dims:
        dims = [int(size) for size in axis_dims.split(',')]
        dim_names = names
    else:
        dims, dim_names = [], []
        for axis_spec in axis_dims.split(','):
            axis_name, axis_size = axis_spec.split(':')
            assert axis_name in names
            dims.append(int(axis_size))
            dim_names.append(axis_name)
        assert(set(dim_names) == set(names))
    assert len(dims) == len(names)

    # Reshaping an index array resolves a -1 entry against the device count.
    device_indices = np.arange(jax.device_count())
    mesh_shape = device_indices.reshape(dims).shape
    if mesh_axis_splitting:
        physical_mesh = np.array(jax.devices()).reshape(mesh_shape)
    else:
        physical_mesh = mesh_utils.create_device_mesh(mesh_shape)
    return Mesh(physical_mesh, dim_names)
 def names_in_current_mesh(*names):
    """ Return True when every given name is an axis of the current mesh. """
    current_mesh = pxla.thread_resources.env.physical_mesh
    return set(names).issubset(set(current_mesh.axis_names))
 def get_names_from_parition_spec(partition_specs):
    """ Collect the distinct axis names used anywhere in the partition specs.

        Dicts are walked through their values, None entries are ignored,
        strings are taken as axis names and anything else is recursed into.
    """
    if isinstance(partition_specs, dict):
        entries = partition_specs.values()
    else:
        entries = partition_specs
    collected = set()
    for entry in entries:
        if entry is None:
            continue
        if isinstance(entry, str):
            collected.add(entry)
        else:
            collected.update(get_names_from_parition_spec(entry))
    return list(collected)
 def with_sharding_constraint(x, partition_specs):
    """ Apply the sharding constraint only when the current mesh defines
        every axis named in `partition_specs`; otherwise return `x` untouched.
    """
    required_axes = get_names_from_parition_spec(partition_specs)
    if not names_in_current_mesh(*required_axes):
        return x
    return _with_sharding_constraint(x, partition_specs)
 def wrap_function_with_rng(rng):
    """ Decorator factory: the decorated function receives a fresh RNG key as
        its first argument on every call, split off the key kept in the closure.
    """
    def decorator(function):
        def call_with_fresh_rng(*args, **kwargs):
            nonlocal rng
            carried, fresh = jax.random.split(rng)
            # Keep the first half for the next call, hand out the second.
            rng = carried
            return function(fresh, *args, **kwargs)
        return call_with_fresh_rng
    return decorator
 def init_rng(seed):
    """ (Re)create the module-level `jax_utils_rng` from an integer seed. """
    global jax_utils_rng
    generator = JaxRNG.from_seed(seed)
    jax_utils_rng = generator
 def next_rng(*args, **kwargs):
    """ Forward the call to the module-level `jax_utils_rng` and return its
        result. `init_rng` must have been called beforehand.
    """
    global jax_utils_rng
    generator = jax_utils_rng
    return generator(*args, **kwargs)
 def get_metrics(metrics, unreplicate=False, stack=False):
    """ Fetch metrics to the host and convert them to plain Python/NumPy values.

        Args:
            metrics: a dict of scalar metrics, or (with `stack=True`) a
                sequence of metric pytrees with identical structure.
            unreplicate: if True, pass `metrics` through
                `flax.jax_utils.unreplicate` first.
            stack: if True, stack the corresponding leaves of all pytrees in
                `metrics` with `np.stack` and return the resulting pytree.

        Returns:
            The stacked pytree when `stack` is True, otherwise a dict mapping
            each key to a Python float.
    """
    if unreplicate:
        metrics = flax.jax_utils.unreplicate(metrics)
    metrics = jax.device_get(metrics)
    if stack:
        # Use jax.tree_util.tree_map like the rest of this module; the
        # top-level jax.tree_map alias is deprecated.
        return jax.tree_util.tree_map(lambda *args: np.stack(args), *metrics)
    return {key: float(val) for key, val in metrics.items()}
 def mse_loss(val, target, valid=None):
    """ Mean squared error between `val` and `target`.

        Positions where `valid` is not positive contribute zero to the sum,
        while the mean is still taken over all elements. When `valid` is
        omitted every position counts.
    """
    if valid is None:
        valid = jnp.ones((*target.shape[:2], 1))
    keep = valid.astype(jnp.float32) > 0.0
    squared_error = jnp.square(val - target)
    masked_error = jnp.where(keep, squared_error, 0.0)
    return jnp.mean(masked_error)
 def cross_entropy_loss_and_accuracy(logits, tokens, valid=None):
    """ Masked token-level cross entropy and argmax accuracy.

        For each sequence, the summed log-probability of the target tokens
        (and the count of correct argmax predictions) over valid positions is
        divided by the number of valid positions; both quantities are then
        averaged over the leading dimension. Returns `(loss, accuracy)`.
    """
    if valid is None:
        valid = jnp.ones(tokens.shape[:2])
    valid = valid.astype(jnp.float32)
    is_valid = valid > 0.0
    # Guard against sequences without any valid position.
    valid_text_length = jnp.maximum(jnp.sum(valid, axis=-1), 1e-10)

    logits = logits.astype(jnp.float32) # for numerical stability
    log_probs = jax.nn.log_softmax(logits, axis=-1)
    picked = jnp.take_along_axis(
        log_probs, jnp.expand_dims(tokens, -1), axis=-1
    )
    token_log_prob = jnp.where(
        is_valid, jnp.squeeze(picked, -1), jnp.array(0.0)
    )
    per_sequence_nll = jnp.sum(token_log_prob, axis=-1) / valid_text_length
    loss = -jnp.mean(per_sequence_nll)

    predictions = jnp.argmax(logits, axis=-1)
    correct = jnp.where(is_valid, predictions == tokens, jnp.array(False))
    per_sequence_acc = jnp.sum(correct, axis=-1) / valid_text_length
    accuracy = jnp.mean(per_sequence_acc)
    return loss, accuracy
 def global_norm(tree):
    """ Return the global L2 norm of a pytree.

        The squared entries of every leaf are summed, the per-leaf sums are
        flattened into one vector, and the square root of their total is
        returned.
    """
    # `jax.flatten_util` is a submodule that `import jax` does not load by
    # itself, so import it explicitly instead of relying on attribute access.
    from jax.flatten_util import ravel_pytree
    squared = jax.tree_util.tree_map(lambda x: jnp.sum(jnp.square(x)), tree)
    flattened, _ = ravel_pytree(squared)
    return jnp.sqrt(jnp.sum(flattened))
 def average_metrics(metrics):
    """ Average a sequence of metric pytrees leaf by leaf.

        `metrics` is a sequence of pytrees with identical structure; the
        result has the same structure, each leaf being the mean of the
        corresponding leaves.
    """
    import contextlib
    # NOTE(review): falls back to a no-op context when this JAX build does
    # not expose `jax.spmd_mode`; confirm that is acceptable for multi-host.
    spmd_mode = getattr(jax, 'spmd_mode', None)
    if spmd_mode is not None:
        context = spmd_mode("allow_all")
    else:
        context = contextlib.nullcontext()
    with context:
        # Use jax.tree_util.tree_map like the rest of this module; the
        # top-level jax.tree_map alias is deprecated.
        return jax.tree_util.tree_map(
            lambda *args: jnp.mean(jnp.stack(args)),
            *metrics
        )
 def get_float_dtype_by_name(dtype):
    """ Map a short or long dtype name (e.g. 'bf16' or 'bfloat16') to the
        matching `jax.numpy` float type. Unknown names raise KeyError.
    """
    aliases = (
        (('bf16', 'bfloat16'), jnp.bfloat16),
        (('fp16', 'float16'), jnp.float16),
        (('fp32', 'float32'), jnp.float32),
        (('fp64', 'float64'), jnp.float64),
    )
    table = {
        alias: float_type
        for names, float_type in aliases
        for alias in names
    }
    return table[dtype]
 def float_tensor_to_dtype(tensor, dtype):
    """ Cast `tensor` to `dtype` if it is a floating point tensor.

        `dtype` may be a dtype object or a name understood by
        `get_float_dtype_by_name`. None or an empty string disables the cast,
        and non-float inputs are returned unchanged.
    """
    if dtype is None or dtype == '':
        return tensor
    target = get_float_dtype_by_name(dtype) if isinstance(dtype, str) else dtype
    float_dtypes = (jnp.bfloat16, jnp.float16, jnp.float32, jnp.float64)
    if getattr(tensor, 'dtype', None) not in float_dtypes:
        return tensor
    return tensor.astype(target)
 def float_to_dtype(tree, dtype):
    """ Apply `float_tensor_to_dtype` with the given dtype to every leaf. """
    def cast_leaf(leaf):
        return float_tensor_to_dtype(leaf, dtype=dtype)
    return jax.tree_util.tree_map(cast_leaf, tree)
 def get_gradient_checkpoint_policy(name):
    """ Look up one of the supported `jax.checkpoint_policies` by name.
        Names outside the supported set raise KeyError.
    """
    supported = (
        'everything_saveable',
        'nothing_saveable',
        'checkpoint_dots',
        'checkpoint_dots_with_no_batch_dims',
    )
    policies = {
        policy_name: getattr(jax.checkpoint_policies, policy_name)
        for policy_name in supported
    }
    return policies[name]
 def tree_path_to_string(path, sep=None):
    """ Convert a JAX tree path into strings.

        Each path entry is rendered from its index / key / attribute name
        (or `str()` of the entry for unknown key types). Returns a tuple of
        the parts when `sep` is None, otherwise the parts joined by `sep`.
    """
    attr_by_key_type = (
        (jax.tree_util.SequenceKey, 'idx'),
        (jax.tree_util.DictKey, 'key'),
        (jax.tree_util.GetAttrKey, 'name'),
        (jax.tree_util.FlattenedIndexKey, 'key'),
    )

    def render(entry):
        for key_type, attr in attr_by_key_type:
            if isinstance(entry, key_type):
                return str(getattr(entry, attr))
        return str(entry)

    parts = [render(entry) for entry in path]
    return tuple(parts) if sep is None else sep.join(parts)
 def flatten_tree(xs, is_leaf=None, sep=None):
    """ Flatten a pytree into a dict keyed by the stringified leaf path
        (see `tree_path_to_string` for the role of `sep`).
    """
    leaves_with_paths, _ = jax.tree_util.tree_flatten_with_path(
        xs, is_leaf=is_leaf
    )
    return {
        tree_path_to_string(path, sep=sep): leaf
        for path, leaf in leaves_with_paths
    }
 def named_tree_map(f, tree, *rest, is_leaf=None, sep=None):
    """ Like `jax.tree_util.tree_map`, except that `f` additionally receives
        the name (stringified path) of the leaf as its first argument.
    """
    def call_with_name(path, leaf, *other_leaves):
        leaf_name = tree_path_to_string(path, sep=sep)
        return f(leaf_name, leaf, *other_leaves)

    return jax.tree_util.tree_map_with_path(
        call_with_name, tree, *rest, is_leaf=is_leaf
    )
 def match_partition_rules(rules, params):
    """ Return a pytree of PartitionSpec, one per leaf of `params`.

        `rules` is a sequence of `(regex, partition_spec)` pairs; the first
        regex found in the '/'-joined leaf name wins. Leaves with zero
        dimensions or a single element always get an empty spec. A leaf that
        matches no rule raises ValueError.
    """
    def get_partition_spec(name, leaf):
        is_scalar_like = len(leaf.shape) == 0 or np.prod(leaf.shape) == 1
        if is_scalar_like:
            # Don't partition scalar values.
            return PS()
        for pattern, spec in rules:
            if re.search(pattern, name):
                return spec
        raise ValueError(f'Partition rule not found for param: {name}')

    return named_tree_map(get_partition_spec, params, sep='/')
 def get_weight_decay_mask(exclusions):
    """ Build a function that maps a parameter pytree to a boolean mask.

        A leaf is masked False when any regex in `exclusions` is found in its
        '/'-joined name, and True otherwise.
    """
    def decay(name, _):
        excluded = any(
            re.search(rule, name) is not None for rule in exclusions
        )
        return not excluded

    def weight_decay_mask(params):
        return named_tree_map(decay, params, sep='/')

    return weight_decay_mask
 def tree_apply(fns, tree):
    """ Call each function in the `fns` pytree on the matching leaf of `tree`. """
    def call_leaf(fn, leaf):
        return fn(leaf)
    return jax.tree_util.tree_map(call_leaf, fns, tree)
--- a/EasyLM/models/init.py
+++ b/EasyLM/models/init.py
--- a/EasyLM/models/gptj/init.py
+++ b/EasyLM/models/gptj/init.py
--- a/EasyLM/models/gptj/gptj_model.py
+++ b/EasyLM/models/gptj/gptj_model.py
--- a/EasyLM/models/gptj/gptj_serve.py
+++ b/EasyLM/models/gptj/gptj_serve.py
@@ -0,0 +1,396 @@
 import pprint
 from functools import partial
 import numpy as np
 import mlxu
 import jax
 import jax.numpy as jnp
 from jax.experimental.pjit import pjit
 from jax.sharding import PartitionSpec as PS
 import flax
 from flax import linen as nn
 from flax.jax_utils import prefetch_to_device
 from flax.training.train_state import TrainState
 import optax
 from transformers import GenerationConfig, FlaxLogitsProcessorList
 from EasyLM.checkpoint import StreamingCheckpointer
 from EasyLM.serving import LMServer
 from EasyLM.jax_utils import (
    JaxRNG, JaxDistributedConfig, next_rng, match_partition_rules, tree_apply,
    set_random_seed, get_float_dtype_by_name, make_shard_and_gather_fns,
    with_sharding_constraint, FlaxTemperatureLogitsWarper
 )
 from EasyLM.models.gptj.gptj_model import (
    GPTJConfig, FlaxGPTJForCausalLMModule, FlaxGPTJForCausalLM
 )
 # Flags of the GPT-J serving script and their default values.
 FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
    seed=42,
    initialize_jax_distributed=False,
    # Mesh axis sizes, parsed by GPTJConfig.get_jax_mesh in main().
    mesh_dim='1,-1,1',
    dtype='bf16',
    # Prompts are padded/truncated to input_length tokens; generation may add
    # up to seq_length - input_length new tokens (see main()).
    input_length=1024,
    seq_length=2048,
    # Sampling settings forwarded to GenerationConfig in forward_generate.
    top_k=50,
    top_p=1.0,
    do_sample=True,
    num_beams=1,
    add_bos_token=False,
    load_gptj_config='',
    # Expected form: '<load_type>::<path>' (split on '::' in main()).
    load_checkpoint='',
    tokenizer=GPTJConfig.get_tokenizer_config(),
    lm_server=LMServer.get_default_config(),
    jax_distributed=JaxDistributedConfig.get_default_config(),
 )
 def main(argv):
    """ Load a GPT-J model, shard its parameters over the device mesh and
        run an `LMServer` exposing log-likelihood and generation endpoints.

        `argv` is accepted for the `mlxu.run` entry point and is not used.
    """
    JaxDistributedConfig.initialize(FLAGS.jax_distributed)
    set_random_seed(FLAGS.seed)
    # Two tokenizers over the same config: prompts are padded/truncated on the
    # left, continuations on the right.
    prefix_tokenizer = GPTJConfig.get_tokenizer(
        FLAGS.tokenizer, truncation_side='left', padding_side='left'
    )
    tokenizer = GPTJConfig.get_tokenizer(
        FLAGS.tokenizer, truncation_side='right', padding_side='right'
    )
    # Load config and parameters with the CPU as default device.
    with jax.default_device(jax.devices("cpu")[0]):
        gptj_config = GPTJConfig.load_config(FLAGS.load_gptj_config)
        # load_checkpoint has the form '<load_type>::<path>'.
        load_type, load_path = FLAGS.load_checkpoint.split('::', 1)
        if load_type == 'huggingface':
            params = gptj_config.load_pretrained(load_path)
        else:
            _, params = StreamingCheckpointer.load_trainstate_checkpoint(
                FLAGS.load_checkpoint, disallow_trainstate=True
            )
        hf_model = FlaxGPTJForCausalLM(
            gptj_config,
            input_shape=(1, FLAGS.seq_length),
            seed=FLAGS.seed,
            _do_init=False
        )
    # Partition specs for every parameter, plus functions that shard each
    # parameter accordingly while casting floats to FLAGS.dtype.
    model_ps = match_partition_rules(
        GPTJConfig.get_partition_rules(), params
    )
    shard_fns, _ = make_shard_and_gather_fns(
        model_ps, get_float_dtype_by_name(FLAGS.dtype)
    )
    @partial(
        pjit,
        in_shardings=(model_ps, PS(), PS()),
        out_shardings=(PS(), PS(), PS())
    )
    def forward_loglikelihood(params, rng, batch):
        """ Score `output_tokens` given `input_tokens`.

            Returns the per-sequence summed log-likelihood over positions
            selected by `output_mask`, a per-sequence flag telling whether
            the argmax prediction matched every selected target, and a new
            RNG key.
        """
        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
        rng_generator = JaxRNG(rng)
        input_tokens = batch['input_tokens']
        output_tokens = batch['output_tokens']
        input_mask = batch['input_mask']
        output_mask = batch['output_mask']
        logits = hf_model.module.apply(
            params, input_tokens, attention_mask=input_mask,
            deterministic=True, rngs=rng_generator(gptj_config.rng_keys()),
        ).logits
        # Push logits of vocabulary entries beyond n_real_tokens to a large
        # negative value so they get negligible probability.
        if gptj_config.n_real_tokens is not None:
          logits = logits.at[:, :, gptj_config.n_real_tokens:].set(-1e8)
        loglikelihood = -optax.softmax_cross_entropy_with_integer_labels(
            logits, output_tokens
        )
        loglikelihood = jnp.sum(loglikelihood * output_mask, axis=-1)
        match_count = jnp.sum(
            (jnp.argmax(logits, axis=-1) == output_tokens) * output_mask,
            axis=-1
        )
        total = jnp.sum(output_mask, axis=-1)
        is_greedy = match_count == total
        return loglikelihood, is_greedy, rng_generator()
    @partial(
        pjit,
        in_shardings=(model_ps, PS(), PS(), PS()),
        out_shardings=(PS(), PS())
    )
    def forward_generate(params, rng, batch, temperature):
        """ Generate continuations using the sampling settings from FLAGS and
            the given temperature. Returns only the newly generated tokens
            (the prompt columns are sliced off) and a new RNG key.
        """
        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
        rng_generator = JaxRNG(rng)
        output = hf_model.generate(
            batch['input_tokens'],
            attention_mask=batch['attention_mask'],
            params=params['params'],
            prng_key=rng_generator(),
            logits_processor=FlaxLogitsProcessorList(
                [FlaxTemperatureLogitsWarper(temperature)]
            ),
            generation_config=GenerationConfig(
                max_new_tokens=FLAGS.seq_length - FLAGS.input_length,
                pad_token_id=tokenizer.eos_token_id,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                do_sample=FLAGS.do_sample,
                num_beams=FLAGS.num_beams,
                top_k=FLAGS.top_k,
                top_p=FLAGS.top_p,
            )
        ).sequences[:, batch['input_tokens'].shape[1]:]
        return output, rng_generator()
    @partial(
        pjit,
        in_shardings=(model_ps, PS(), PS()),
        out_shardings=(PS(), PS())
    )
    def forward_greedy_generate(params, rng, batch):
        """ Same as `forward_generate` but with sampling disabled and a
            single beam, independent of the sampling flags.
        """
        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
        rng_generator = JaxRNG(rng)
        output = hf_model.generate(
            batch['input_tokens'],
            attention_mask=batch['attention_mask'],
            params=params['params'],
            prng_key=rng_generator(),
            generation_config=GenerationConfig(
                max_new_tokens=FLAGS.seq_length - FLAGS.input_length,
                pad_token_id=tokenizer.eos_token_id,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                do_sample=False,
                num_beams=1,
            )
        ).sequences[:, batch['input_tokens'].shape[1]:]
        return output, rng_generator()
    mesh = GPTJConfig.get_jax_mesh(FLAGS.mesh_dim)
    with mesh:
        # Shard the loaded parameters and draw the RNG key that the server
        # methods thread through the forward functions.
        params = tree_apply(shard_fns, params)
        sharded_rng = next_rng()
    class ModelServer(LMServer):
        """ LMServer backed by the closures and sharded parameters above. """
        @staticmethod
        def loglikelihood(prefix_text, text):
            """ Log-likelihood of `text` given `prefix_text`.

                The prefix is padded/truncated to `input_length` tokens and
                the continuation to `seq_length - input_length` tokens; only
                continuation positions count towards the result. Returns
                `(loglikelihood, is_greedy)` fetched to the host.
            """
            nonlocal sharded_rng
            prefix = prefix_tokenizer(
                prefix_text,
                padding='max_length',
                truncation=True,
                max_length=FLAGS.input_length,
                return_tensors='np',
            )
            inputs = tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=FLAGS.seq_length - FLAGS.input_length,
                return_tensors='np',
            )
            output_tokens = np.concatenate([prefix.input_ids, inputs.input_ids], axis=1)
            bos_tokens = np.full(
                (output_tokens.shape[0], 1), tokenizer.bos_token_id, dtype=np.int32
            )
            # Model inputs are the targets shifted right by one, with a BOS
            # token in the first column.
            input_tokens = np.concatenate([bos_tokens, output_tokens[:, :-1]], axis=-1)
            input_mask = np.concatenate(
                [prefix.attention_mask, inputs.attention_mask], axis=1
            )
            # The BOS column is attended to only when add_bos_token is set.
            if FLAGS.add_bos_token:
                bos_mask = np.ones_like(input_mask[:, :1])
            else:
                bos_mask = np.zeros_like(input_mask[:, :1])
            input_mask = np.concatenate([bos_mask, input_mask[:, :-1]], axis=1)
            # Only continuation tokens are scored; prefix positions are zeroed.
            output_mask = np.concatenate(
                [np.zeros_like(prefix.attention_mask), inputs.attention_mask], axis=1
            )
            batch = dict(
                input_tokens=input_tokens,
                output_tokens=output_tokens,
                input_mask=input_mask,
                output_mask=output_mask,
            )
            with mesh:
                loglikelihood, is_greedy, sharded_rng = forward_loglikelihood(
                    params, sharded_rng, batch
                )
                loglikelihood, is_greedy = jax.device_get((loglikelihood, is_greedy))
            return loglikelihood, is_greedy
        @staticmethod
        def loglikelihood_rolling(text):
            """ Log-likelihood of the full `text`, scored in consecutive
                windows of `seq_length` tokens.

                Returns the log-likelihood summed over all windows and the
                logical AND of the per-window `is_greedy` flags.
            """
            nonlocal sharded_rng
            inputs = tokenizer(
                text,
                padding='longest',
                truncation=False,
                max_length=np.iinfo(np.int32).max,
                return_tensors='np',
            )
            batch_size = inputs.input_ids.shape[0]
            output_tokens = inputs.input_ids
            attention_mask = inputs.attention_mask
            # Right-pad short inputs up to seq_length with masked pad tokens.
            if output_tokens.shape[1] < FLAGS.seq_length:
                padding_length = FLAGS.seq_length - output_tokens.shape[1]
                pad_tokens = np.full(
                    (batch_size, padding_length), tokenizer.pad_token_id, dtype=np.int32
                )
                output_tokens = np.concatenate([output_tokens, pad_tokens], axis=-1)
                pad_mask = np.zeros(
                    (batch_size, padding_length), dtype=inputs.attention_mask.dtype
                )
                attention_mask = np.concatenate([attention_mask, pad_mask], axis=-1)
            bos_tokens = np.full(
                (batch_size, 1), tokenizer.bos_token_id, dtype=np.int32
            )
            input_tokens = np.concatenate([bos_tokens, output_tokens[:, :-1]], axis=-1)
            # NOTE(review): bos_mask is built here but not used below.
            bos_mask = np.ones((batch_size, 1), dtype=inputs.attention_mask.dtype)
            total_seq_length = output_tokens.shape[1]
            total_loglikelihood = 0.0
            total_is_greedy = True
            # Sliding window
            for i in range(0, total_seq_length, FLAGS.seq_length):
                # Last window
                if i + FLAGS.seq_length > total_seq_length:
                    # The final window is taken from the end of the sequence,
                    # so it overlaps the previous one; positions before index
                    # i were already scored and are masked out of the output.
                    last_output_mask = np.copy(attention_mask[:, -FLAGS.seq_length:])
                    last_output_mask[:, :i - total_seq_length] = 0.0
                    batch = dict(
                        input_tokens=input_tokens[:, -FLAGS.seq_length:],
                        output_tokens=output_tokens[:, -FLAGS.seq_length:],
                        input_mask=attention_mask[:, -FLAGS.seq_length:],
                        output_mask=last_output_mask,
                    )
                # Normal window
                else:
                    batch = dict(
                        input_tokens=input_tokens[:, i:i + FLAGS.seq_length],
                        output_tokens=output_tokens[:, i:i + FLAGS.seq_length],
                        input_mask=attention_mask[:, i:i + FLAGS.seq_length],
                        output_mask=attention_mask[:, i:i + FLAGS.seq_length],
                    )
                with mesh:
                    loglikelihood, is_greedy, sharded_rng = forward_loglikelihood(
                        params, sharded_rng, batch
                    )
                    loglikelihood, is_greedy = jax.device_get((loglikelihood, is_greedy))
                total_loglikelihood += loglikelihood
                total_is_greedy = np.logical_and(is_greedy, total_is_greedy)
            return total_loglikelihood, total_is_greedy
        @staticmethod
        def generate(text, temperature):
            """ Generate a continuation for each prompt in `text` at the
                given temperature. Returns the decoded strings, cut at the
                first EOS token when one is present.
            """
            nonlocal sharded_rng
            inputs = prefix_tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=FLAGS.input_length,
                return_tensors='np',
            )
            input_tokens = inputs.input_ids
            input_mask = inputs.attention_mask
            # Optionally overwrite the first position with an attended BOS.
            if FLAGS.add_bos_token:
                input_tokens[:, 0] = tokenizer.bos_token_id
                input_mask[:, 0] = 1
            batch = dict(
                input_tokens=input_tokens,
                attention_mask=input_mask,
            )
            with mesh:
                output, sharded_rng = forward_generate(
                    params, sharded_rng, batch, temperature
                )
                output = jax.device_get(output)
            output_text = []
            for text in list(tokenizer.batch_decode(output)):
                # Keep only what precedes the first EOS token.
                if tokenizer.eos_token in text:
                    text = text.split(tokenizer.eos_token, maxsplit=1)[0]
                output_text.append(text)
            return output_text
        @staticmethod
        def greedy_until(prefix_text, until, max_length):
            """ Greedily extend each prefix until one of its stop strings
                appears or at least `max_length` tokens were generated.

                `until` holds, per prefix, a stop string or a list of stop
                strings. Returns one generated string per prefix, truncated
                before the stop string when one was found.
            """
            nonlocal sharded_rng
            all_outputs = []
            for pf, ut in zip(prefix_text, until):
                if isinstance(ut, str):
                    ut = [ut]
                total_length = 0
                total_generated = ''
                while total_length < max_length:
                    # Re-tokenize the prefix each round: it grows by the text
                    # generated in the previous round.
                    pf_tokens = tokenizer(
                        pf,
                        padding=False,
                        truncation=False,
                        max_length=np.iinfo(np.int32).max,
                        return_tensors='np',
                    )
                    input_tokens = pf_tokens.input_ids
                    attention_mask = pf_tokens.attention_mask
                    # Left-pad short prompts to input_length; keep only the
                    # last input_length tokens of long ones.
                    if input_tokens.shape[1] < FLAGS.input_length:
                        extra = FLAGS.input_length - input_tokens.shape[1]
                        pad_tokens = np.full(
                            (1, extra), tokenizer.pad_token_id, dtype=np.int32
                        )
                        input_tokens = np.concatenate(
                            [pad_tokens, input_tokens], axis=1
                        )
                        pad_attention = np.zeros((1, extra), dtype=attention_mask.dtype)
                        attention_mask = np.concatenate(
                            [pad_attention, attention_mask], axis=1
                        )
                    elif input_tokens.shape[1] > FLAGS.input_length:
                        input_tokens = input_tokens[:, -FLAGS.input_length:]
                        attention_mask = attention_mask[:, -FLAGS.input_length:]
                    if FLAGS.add_bos_token:
                        input_tokens[:, 0] = tokenizer.bos_token_id
                        attention_mask[:, 0] = 1
                    batch = dict(input_tokens=input_tokens, attention_mask=attention_mask)
                    with mesh:
                        output, sharded_rng = forward_greedy_generate(
                            params, sharded_rng, batch
                        )
                        output = jax.device_get(output)
                    total_length += output.shape[1]
                    output_text = tokenizer.batch_decode(output)[0]
                    total_generated = total_generated + output_text
                    pf = pf + output_text
                    done = False
                    # Cut the output at any stop string found so far.
                    for s in ut:
                        if s in total_generated:
                            total_generated = total_generated.split(s, maxsplit=1)[0]
                            done = True
                    if done:
                        break
                all_outputs.append(total_generated)
            return all_outputs
    server = ModelServer(FLAGS.lm_server)
    server.run()
 # Script entry point: hand `main` to mlxu.run when executed directly.
 if __name__ == "__main__":
    mlxu.run(main)
--- a/EasyLM/models/gptj/gptj_train.py
+++ b/EasyLM/models/gptj/gptj_train.py
@@ -0,0 +1,272 @@
 import pprint
 from functools import partial
 from tqdm import tqdm, trange
 import numpy as np
 import mlxu
 import jax
 import jax.numpy as jnp
 from jax.experimental.pjit import pjit, with_sharding_constraint
 from jax.sharding import PartitionSpec as PS
 from flax.training.train_state import TrainState
 from EasyLM.data import DatasetFactory
 from EasyLM.checkpoint import StreamingCheckpointer
 from EasyLM.optimizers import OptimizerFactory
 from EasyLM.jax_utils import (
    JaxRNG, JaxDistributedConfig, next_rng, match_partition_rules,
    cross_entropy_loss_and_accuracy, global_norm, get_float_dtype_by_name,
    set_random_seed, average_metrics, get_weight_decay_mask,
    make_shard_and_gather_fns, tree_apply
 )
 from EasyLM.models.gptj.gptj_model import GPTJConfig, FlaxGPTJForCausalLMModule
 # Flags for GPT-J training together with their default values. The nested
 # entries (tokenizer, datasets, optimizer, checkpointer, gptj, logger,
 # jax_distributed) take their defaults from the corresponding component.
 FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
    seed=42,  # handed to set_random_seed() in main
    mesh_dim='1,-1,1',  # mesh spec passed to GPTJConfig.get_jax_mesh()
    dtype='fp32',  # resolved through get_float_dtype_by_name() for the model
    total_steps=10000,  # upper bound of the training step counter
    load_gptj_config='',  # if non-empty, the model config is loaded from here
    update_gptj_config='',  # if non-empty, eval()-ed and merged into the model config
    load_checkpoint='',  # "<load_type>::<path>"; empty means initialize from scratch
    load_dataset_state='',  # if non-empty, pickle file used to restore the dataset state
    log_freq=50,  # metrics are logged when step % log_freq == 0
    save_model_freq=0,  # regular checkpoint interval in steps; 0 disables saving
    save_milestone_freq=0,  # milestone checkpoint interval in steps; 0 disables
    eval_steps=0,  # eval batches per logging step; 0 disables evaluation
    tokenizer=GPTJConfig.get_tokenizer_config(),
    train_dataset=DatasetFactory.get_default_config(),
    eval_dataset=DatasetFactory.get_default_config(),
    optimizer=OptimizerFactory.get_default_config(),
    checkpointer=StreamingCheckpointer.get_default_config(),
    gptj=GPTJConfig.get_default_config(),
    logger=mlxu.WandBLogger.get_default_config(),
    log_all_worker=False,  # when True the logger is enabled on every process
    jax_distributed=JaxDistributedConfig.get_default_config(),
 )
 def main(argv):
    """Train a GPT-J causal language model as configured by FLAGS.

    Builds the logger, datasets, model and optimizer, wraps the init, train
    and eval steps with pjit, restores or initializes the train state, and
    then runs the training loop with periodic logging, evaluation and
    checkpointing. ``argv`` is not used in this function.
    """
    JaxDistributedConfig.initialize(FLAGS.jax_distributed)
    variant = mlxu.get_user_flags(FLAGS, FLAGS_DEF)
    flags_config_dict = mlxu.user_flags_to_config_dict(FLAGS, FLAGS_DEF)
    # Logging is enabled on process 0 only, unless log_all_worker is set.
    logger = mlxu.WandBLogger(
        config=FLAGS.logger,
        variant=variant,
        enable=FLAGS.log_all_worker or (jax.process_index() == 0),
    )
    set_random_seed(FLAGS.seed)
    tokenizer = GPTJConfig.get_tokenizer(FLAGS.tokenizer)
    dataset = DatasetFactory.load_dataset(FLAGS.train_dataset, tokenizer)
    if FLAGS.load_dataset_state != '':
        # NOTE(security): presumably unpickles the given file; only point this
        # flag at trusted dataset-state files.
        dataset.load_state_dict(mlxu.load_pickle(FLAGS.load_dataset_state))
    if FLAGS.eval_steps > 0:
        # eval_iterator only exists when evaluation is enabled; the training
        # loop guards its use with the same condition.
        eval_dataset = DatasetFactory.load_dataset(
            FLAGS.eval_dataset, dataset.tokenizer
        )
        eval_iterator = iter(eval_dataset)
    seq_length = dataset.seq_length
    if FLAGS.load_gptj_config != '':
        gptj_config = GPTJConfig.load_config(FLAGS.load_gptj_config)
    else:
        gptj_config = GPTJConfig(**FLAGS.gptj)
    if FLAGS.update_gptj_config != '':
        # NOTE(security): eval() executes arbitrary Python from the flag value;
        # acceptable only because the flag comes from the person launching
        # the job. Never feed untrusted input here.
        gptj_config.update(dict(eval(FLAGS.update_gptj_config)))
    # Keep the model's special token ids in sync with the dataset tokenizer.
    gptj_config.update(dict(
        bos_token_id=dataset.tokenizer.bos_token_id,
        eos_token_id=dataset.tokenizer.eos_token_id,
    ))
    # Grow (never shrink) the model vocabulary to cover the dataset's.
    if gptj_config.vocab_size < dataset.vocab_size:
        gptj_config.update(dict(vocab_size=dataset.vocab_size))
    model = FlaxGPTJForCausalLMModule(
        gptj_config, dtype=get_float_dtype_by_name(FLAGS.dtype)
    )
    optimizer, optimizer_info = OptimizerFactory.get_optimizer(
        FLAGS.optimizer,
        get_weight_decay_mask(GPTJConfig.get_weight_decay_exclusions()),
    )
    def create_trainstate_from_params(params):
        """Wrap already-loaded params in a fresh TrainState."""
        return TrainState.create(params=params, tx=optimizer, apply_fn=None)
    def init_fn(rng):
        """Initialize model params from `rng` and wrap them in a TrainState."""
        rng_generator = JaxRNG(rng)
        # Dummy inputs with batch size 4 are used only to initialize the params.
        params = model.init(
            input_ids=jnp.zeros((4, seq_length), dtype=jnp.int32),
            position_ids=jnp.zeros((4, seq_length), dtype=jnp.int32),
            attention_mask=jnp.ones((4, seq_length), dtype=jnp.int32),
            rngs=rng_generator(gptj_config.rng_keys()),
        )
        return TrainState.create(params=params, tx=optimizer, apply_fn=None)
    def train_step(train_state, rng, batch):
        """Run one optimizer update; return (new state, new rng, metrics)."""
        rng_generator = JaxRNG(rng)
        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
        def loss_and_accuracy(params):
            logits = model.apply(
                params, batch['input_tokens'], deterministic=False,
                rngs=rng_generator(gptj_config.rng_keys()),
            ).logits
            return cross_entropy_loss_and_accuracy(
                logits, batch['target_tokens'], batch['loss_masks']
            )
        # has_aux=True: the loss is differentiated, accuracy is carried along.
        grad_fn = jax.value_and_grad(loss_and_accuracy, has_aux=True)
        (loss, accuracy), grads = grad_fn(train_state.params)
        train_state = train_state.apply_gradients(grads=grads)
        # learning_rate and param_norm are read from the already-updated state.
        metrics = dict(
            loss=loss,
            accuracy=accuracy,
            learning_rate=optimizer_info['learning_rate_schedule'](train_state.step),
            gradient_norm=global_norm(grads),
            param_norm=global_norm(train_state.params),
        )
        return train_state, rng_generator(), metrics
    def eval_step(train_state, rng, batch):
        """Compute eval loss/accuracy on one batch; return (new rng, metrics)."""
        rng_generator = JaxRNG(rng)
        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
        logits = model.apply(
            train_state.params, batch['input_tokens'], deterministic=True,
            rngs=rng_generator(gptj_config.rng_keys()),
        ).logits
        loss, accuracy = cross_entropy_loss_and_accuracy(
            logits, batch['target_tokens'], batch['loss_masks']
        )
        metrics = dict(
            eval_loss=loss,
            eval_accuracy=accuracy,
        )
        return rng_generator(), metrics
    # Derive the train-state structure without materializing it, then map it
    # to partition specs via the model's partition rules.
    train_state_shapes = jax.eval_shape(init_fn, next_rng())
    train_state_partition = match_partition_rules(
        GPTJConfig.get_partition_rules(), train_state_shapes
    )
    shard_fns, gather_fns = make_shard_and_gather_fns(
        train_state_partition, train_state_shapes
    )
    # Only process 0 has checkpointing enabled.
    checkpointer = StreamingCheckpointer(
        FLAGS.checkpointer, logger.output_dir,
        enable=jax.process_index() == 0,
    )
    sharded_init_fn = pjit(
        init_fn,
        in_shardings=PS(),
        out_shardings=train_state_partition
    )
    sharded_create_trainstate_from_params = pjit(
        create_trainstate_from_params,
        in_shardings=(train_state_partition.params, ),
        out_shardings=train_state_partition,
        donate_argnums=(0, ),
    )
    # The train step donates the previous train state and rng (args 0 and 1).
    sharded_train_step = pjit(
        train_step,
        in_shardings=(train_state_partition, PS(), PS()),
        out_shardings=(train_state_partition, PS(), PS()),
        donate_argnums=(0, 1),
    )
    # The eval step donates only the rng; the train state is reused afterwards.
    sharded_eval_step = pjit(
        eval_step,
        in_shardings=(train_state_partition, PS(), PS()),
        out_shardings=(PS(), PS()),
        donate_argnums=(1,),
    )
    def save_checkpoint(train_state, milestone=False):
        """Save train state, run metadata and dataset state via the checkpointer."""
        step = int(jax.device_get(train_state.step))
        metadata = dict(
            step=step,
            variant=variant,
            flags=flags_config_dict,
            gptj_config=gptj_config.to_dict(),
        )
        checkpointer.save_all(
            train_state=train_state,
            gather_fns=gather_fns,
            metadata=metadata,
            dataset=dataset.get_state_dict(),
            milestone=milestone,
        )
    mesh = GPTJConfig.get_jax_mesh(FLAGS.mesh_dim)
    with mesh:
        train_state, restored_params = None, None
        if FLAGS.load_checkpoint != '':
            # The flag has the form "<load_type>::<path>".
            load_type, load_path = FLAGS.load_checkpoint.split('::', 1)
            if load_type == 'huggingface':
                restored_params = tree_apply(
                    shard_fns.params, gptj_config.load_pretrained(load_path)
                )
                train_state = None
            else:
                # Any other load type is delegated to the checkpointer, which
                # receives the full, unsplit flag value.
                train_state, restored_params = checkpointer.load_trainstate_checkpoint(
                    FLAGS.load_checkpoint, train_state_shapes, shard_fns
                )
        if train_state is None and restored_params is None:
            # Initialize from scratch
            train_state = sharded_init_fn(next_rng())
        elif train_state is None and restored_params is not None:
            # Restore from params but initialize train_state
            train_state = sharded_create_trainstate_from_params(restored_params)
            del restored_params
        # Resume the step counter from the (possibly restored) train state.
        start_step = int(jax.device_get(train_state.step))
        # An initial checkpoint is written before training when saving is on.
        if FLAGS.save_model_freq > 0:
            save_checkpoint(train_state)
        sharded_rng = next_rng()
        step_counter = trange(start_step, FLAGS.total_steps, ncols=0)
        # zip() stops at whichever ends first: the step counter or the dataset.
        for step, (batch, dataset_metrics) in zip(step_counter, dataset):
            train_state, sharded_rng, metrics = sharded_train_step(
                train_state, sharded_rng, batch
            )
            if step % FLAGS.log_freq == 0:
                # Evaluation runs only on logging steps.
                if FLAGS.eval_steps > 0:
                    eval_metric_list = []
                    for _ in range(FLAGS.eval_steps):
                        eval_batch, _ = next(eval_iterator)
                        sharded_rng, eval_metrics = sharded_eval_step(
                            train_state, sharded_rng, eval_batch
                        )
                        eval_metric_list.append(eval_metrics)
                    metrics.update(average_metrics(eval_metric_list))
                log_metrics = {"step": step}
                log_metrics.update(metrics)
                log_metrics.update(dataset_metrics)
                log_metrics = jax.device_get(log_metrics)
                logger.log(log_metrics)
                tqdm.write("\n" + pprint.pformat(log_metrics) + "\n")
            # A milestone save takes precedence over a regular save on the same step.
            if FLAGS.save_milestone_freq > 0 and (step + 1) % FLAGS.save_milestone_freq == 0:
                save_checkpoint(train_state, milestone=True)
            elif FLAGS.save_model_freq > 0 and (step + 1) % FLAGS.save_model_freq == 0:
                save_checkpoint(train_state)
        # Final checkpoint after the loop ends.
        if FLAGS.save_model_freq > 0:
            save_checkpoint(train_state)
 # Script entry point: hand `main` to mlxu.run when executed directly.
 if __name__ == "__main__":
    mlxu.run(main)
--- a/EasyLM/models/llama/convert_easylm_to_hf.py
+++ b/EasyLM/models/llama/convert_easylm_to_hf.py
@@ -0,0 +1,338 @@
 # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
 # Copyright 2023 Xinyang Geng
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # This script converts a LLaMA model checkpoint trained by EasyLM to the
 # HuggingFace transformers LLaMA PyTorch format, which can then be loaded
 # by HuggingFace transformers.
 import gc
 import json
 import math
 import os
 import shutil
 import numpy as np
 import mlxu
 import jax
 import jax.numpy as jnp
 import flax
 from flax.traverse_util import flatten_dict
 import torch
 from transformers import LlamaConfig, LlamaForCausalLM
 from EasyLM.checkpoint import StreamingCheckpointer
 from EasyLM.jax_utils import float_tensor_to_dtype
 # Flags for the EasyLM -> HuggingFace conversion script.
 FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
    load_checkpoint='',  # EasyLM checkpoint to convert (must be non-empty)
    tokenizer_path='',  # only referenced by the disabled write_tokenizer call in main
    model_size='13b',  # key into LLAMA_STANDARD_CONFIGS
    output_dir='',  # directory that receives the converted model (must be non-empty)
 )
 # Architecture hyper-parameters per model size. Every size shares the same
 # vocabulary size; the remaining columns are listed one row per size:
 # (size, dim, intermediate_size, n_layers, n_heads, norm_eps).
 LLAMA_STANDARD_CONFIGS = {
    size: {
        'vocab_size': 64256,
        'dim': dim,
        'intermediate_size': intermediate_size,
        'n_layers': n_layers,
        'n_heads': n_heads,
        'norm_eps': norm_eps,
    }
    for size, dim, intermediate_size, n_layers, n_heads, norm_eps in (
        ('small', 768, 3072, 12, 12, 1e-6),
        ('medium', 1024, 4096, 24, 16, 1e-6),
        ('large', 1536, 6144, 24, 16, 1e-6),
        ('xlarge', 2048, 8192, 24, 32, 1e-6),
        ('1b', 2048, 5504, 22, 16, 1e-6),
        ('3b', 3200, 8640, 26, 32, 1e-6),
        ('7b', 4096, 11008, 32, 32, 1e-6),
        ('13b', 5120, 13824, 40, 40, 1e-6),
        ('30b', 6656, 17920, 60, 52, 1e-6),
        ('65b', 8192, 22016, 80, 64, 1e-5),
    )
 }
 def match_keywords(string, positives, negatives):
    """Return True when `string` contains every positive and no negative.

    Both keyword collections are matched as plain substrings. Empty
    collections impose no constraint.
    """
    contains_all_positives = all(keyword in string for keyword in positives)
    # `and` short-circuits, so negatives are only scanned once positives pass.
    return contains_all_positives and not any(
        keyword in string for keyword in negatives
    )
 def load_and_convert_checkpoint(path):
    """Load an EasyLM checkpoint and return its weights as torch tensors.

    The 'params' tree of the checkpoint is flattened to dot-separated keys.
    Entries whose key contains "kernel" but neither "norm" nor "ln_f" are
    transposed. Every entry is converted via fp32 into a float16 tensor.
    """
    _, train_state = StreamingCheckpointer.load_trainstate_checkpoint(path)
    flat_params = flatten_dict(train_state['params'], sep='.')
    def to_torch(name, array):
        # Norm weights also use the "kernel" name but must not be transposed.
        if match_keywords(name, ["kernel"], ["norm", 'ln_f']):
            array = array.T
        as_fp32 = float_tensor_to_dtype(array, 'fp32')
        return torch.tensor(as_fp32, dtype=torch.float16)
    return {name: to_torch(name, array) for name, array in flat_params.items()}
 def read_json(path):
    """Parse the JSON file at `path` and return the decoded object.

    The file is read as UTF-8 rather than with the platform's locale
    encoding, so non-ASCII content decodes the same way everywhere.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
 def write_json(text, path):
    """Serialize `text` (any JSON-serializable object) as JSON into `path`.

    An existing file at `path` is overwritten.
    """
    with open(path, "w") as handle:
        handle.write(json.dumps(text))
 def write_model(loaded, model_path, model_size):
    """Write converted weights to `model_path` in the HuggingFace LLaMA format.

    Args:
        loaded: mapping from flattened EasyLM parameter names (for example
            "transformer.h.0.attention.wq.kernel") to torch tensors.
        model_path: output directory; created if it does not exist.
        model_size: key into LLAMA_STANDARD_CONFIGS selecting the architecture.

    The weights are first written as per-layer PyTorch shards plus an index
    and config into `<model_path>/tmp`, then reloaded with LlamaForCausalLM
    and saved with safe serialization into `model_path`. The temporary
    directory is removed at the end.
    NOTE(review): there is no try/finally, so the temporary directory is
    left behind if any step raises.
    """
    os.makedirs(model_path, exist_ok=True)
    tmp_model_path = os.path.join(model_path, "tmp")
    os.makedirs(tmp_model_path, exist_ok=True)
    params = LLAMA_STANDARD_CONFIGS[model_size]
    n_layers = params["n_layers"]
    n_heads = params["n_heads"]
    dim = params["dim"]
    dims_per_head = dim // n_heads
    # Rotary-embedding inverse frequencies, stored alongside every layer.
    base = 10000.0
    inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
    # Reorder the rows of a (dim, dim) q/k projection: within each head the
    # (dims_per_head // 2, 2) row layout is swapped to (2, dims_per_head // 2).
    def permute(w):
        return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
    param_count = 0
    index_dict = {"weight_map": {}}
    # One shard file per transformer layer; the extra final shard is written
    # after this loop.
    for layer_i in range(n_layers):
        filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin"
        state_dict = {
            f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
                loaded[f"transformer.h.{layer_i}.attention.wq.kernel"]
            ),
            f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
                loaded[f"transformer.h.{layer_i}.attention.wk.kernel"]
            ),
            f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"transformer.h.{layer_i}.attention.wv.kernel"],
            f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"transformer.h.{layer_i}.attention.wo.kernel"],
            f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"transformer.h.{layer_i}.feed_forward.w1.kernel"],
            f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"transformer.h.{layer_i}.feed_forward.w2.kernel"],
            f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"transformer.h.{layer_i}.feed_forward.w3.kernel"],
            f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"transformer.h.{layer_i}.attention_norm.kernel"],
            f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"transformer.h.{layer_i}.ffn_norm.kernel"],
        }
        state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
        # Record which shard holds each tensor and accumulate element counts
        # (inv_freq is included in the count).
        for k, v in state_dict.items():
            index_dict["weight_map"][k] = filename
            param_count += v.numel()
        torch.save(state_dict, os.path.join(tmp_model_path, filename))
    filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin"
    # Final shard: token embeddings, final norm and LM head.
    state_dict = {
        "model.embed_tokens.weight": loaded["transformer.wte.embedding"],
        "model.norm.weight": loaded["transformer.ln_f.kernel"],
        "lm_head.weight": loaded["lm_head.kernel"],
    }
    for k, v in state_dict.items():
        index_dict["weight_map"][k] = filename
        param_count += v.numel()
    torch.save(state_dict, os.path.join(tmp_model_path, filename))
    # Write configs
    # total_size assumes 2 bytes per element (presumably float16 weights);
    # the float32 inv_freq entries make this an approximation.
    index_dict["metadata"] = {"total_size": param_count * 2}
    write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))
    config = LlamaConfig(
        vocab_size=params["vocab_size"],
        hidden_size=dim,
        intermediate_size=params["intermediate_size"],
        num_attention_heads=params["n_heads"],
        num_hidden_layers=params["n_layers"],
        rms_norm_eps=params["norm_eps"],
    )
    config.save_pretrained(tmp_model_path)
    # Make space so we can load the model properly now.
    del state_dict
    del loaded
    gc.collect()
    print("Loading the checkpoint in a Llama model.")
    model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float16)
    print("Model parameter count", model.num_parameters())
    # Avoid saving this as part of the config.
    del model.config._name_or_path
    print("Saving in the Transformers format.")
    model.save_pretrained(model_path, safe_serialization=True)
    shutil.rmtree(tmp_model_path)
 def write_tokenizer(tokenizer_path, input_tokenizer_path):
    """Write tokenizer files for a LlamaTokenizer into `tokenizer_path`.

    Creates the directory if needed, writes special_tokens_map.json and
    tokenizer_config.json, and copies `input_tokenizer_path` to
    `<tokenizer_path>/tokenizer.model`.
    """
    def token_entry(content, tagged):
        # The tokenizer config tags each token with "__type"; the special
        # tokens map does not. Key order is kept stable for the JSON output.
        entry = {"__type": "AddedToken"} if tagged else {}
        entry.update(
            content=content,
            lstrip=False,
            normalized=True,
            rstrip=False,
            single_word=False,
        )
        return entry
    def special_tokens(tagged):
        return {
            "bos_token": token_entry("<s>", tagged),
            "eos_token": token_entry("</s>", tagged),
            "unk_token": token_entry("<unk>", tagged),
        }
    print(f"Fetching the tokenizer from {input_tokenizer_path}.")
    os.makedirs(tokenizer_path, exist_ok=True)
    special_tokens_file = os.path.join(tokenizer_path, "special_tokens_map.json")
    write_json(special_tokens(tagged=False), special_tokens_file)
    tokenizer_config = {
        "add_bos_token": True,
        "add_eos_token": False,
        "model_max_length": 2048,
        "pad_token": None,
        "sp_model_kwargs": {},
        "tokenizer_class": "LlamaTokenizer",
        "clean_up_tokenization_spaces": False,
    }
    tokenizer_config.update(special_tokens(tagged=True))
    write_json(
        tokenizer_config,
        os.path.join(tokenizer_path, "tokenizer_config.json"),
    )
    model_file = os.path.join(tokenizer_path, "tokenizer.model")
    shutil.copyfile(input_tokenizer_path, model_file)
 def main(argv):
    """Convert the EasyLM checkpoint named by FLAGS into HuggingFace format.

    Requires non-empty `load_checkpoint` and `output_dir` flags and a
    `model_size` present in LLAMA_STANDARD_CONFIGS. ``argv`` is not used.
    """
    assert not (FLAGS.load_checkpoint == "" or FLAGS.output_dir == "")
    assert FLAGS.model_size in LLAMA_STANDARD_CONFIGS
    # NOTE: tokenizer export is currently disabled, so `tokenizer_path` is
    # not required. To re-enable it, call
    #     write_tokenizer(
    #         tokenizer_path=FLAGS.output_dir,
    #         input_tokenizer_path=FLAGS.tokenizer_path,
    #     )
    # and also assert that FLAGS.tokenizer_path is non-empty.
    converted_params = load_and_convert_checkpoint(FLAGS.load_checkpoint)
    write_model(
        converted_params,
        model_path=FLAGS.output_dir,
        model_size=FLAGS.model_size,
    )
 # Script entry point: hand `main` to mlxu.run when executed directly.
 if __name__ == "__main__":
    mlxu.run(main)
--- a/EasyLM/models/llama/convert_hf_to_easylm.py
+++ b/EasyLM/models/llama/convert_hf_to_easylm.py
@@ -0,0 +1,196 @@
 """
 Usage:
 python convert_hf_to_easylm.py  \
       --checkpoint_dir     /path/hf_format_dir/    \
       --output_file /path/easylm_format.stream   \
       --model_size 7b \
       --streaming
 """
 import time
 from pathlib import Path
 import argparse
 import mlxu
 import torch
 import flax
 from EasyLM.checkpoint import StreamingCheckpointer
 # Architecture hyper-parameters per model size, one row per size:
 # (size, dim, intermediate_size, n_layers, n_heads, norm_eps).
 LLAMA_STANDARD_CONFIGS = {
    size: {
        "dim": dim,
        "intermediate_size": intermediate_size,
        "n_layers": n_layers,
        "n_heads": n_heads,
        "norm_eps": norm_eps,
    }
    for size, dim, intermediate_size, n_layers, n_heads, norm_eps in (
        ("1b", 2048, 5504, 22, 16, 1e-6),
        ("3b", 3200, 8640, 26, 32, 1e-6),
        ("7b", 4096, 11008, 32, 32, 1e-6),
        ("13b", 5120, 13824, 40, 40, 1e-6),
        ("30b", 6656, 17920, 60, 52, 1e-6),
        ("65b", 8192, 22016, 80, 64, 1e-5),
    )
 }
 def inverse_permute(params, w):
    """Reorder the rows of a (dim, dim) projection matrix head by head.

    Within each head the rows are viewed as (2, head_dim // 2) and swapped
    to (head_dim // 2, 2), which interleaves the two halves of the head.
    This looks like the inverse of the rotary `permute` applied when
    exporting to the HuggingFace layout (TODO confirm against that script).

    Args:
        params: mapping providing "n_heads" and "dim".
        w: array of shape (dim, dim) supporting numpy-style
            ``reshape`` and ``transpose(*axes)``.

    Returns:
        The row-reordered array with shape (dim, dim).
    """
    n_heads = params["n_heads"]
    dim = params["dim"]
    half_head_dim = dim // n_heads // 2
    per_head = w.reshape(n_heads, 2, half_head_dim, dim)
    # Swap the "half" axis with the "position within half" axis.
    return per_head.transpose(0, 2, 1, 3).reshape(dim, dim)
 def main(args):
    """Convert HuggingFace LLaMA ``*.bin`` shards into an EasyLM checkpoint.

    Reads `args.checkpoint_dir`, `args.model_size`, `args.streaming` and
    `args.output_file`. All shards are merged into one name -> tensor map
    (a leading "model." is stripped from keys), rearranged into the EasyLM
    parameter tree, and saved either in streaming format or as one msgpack
    blob.
    """
    start = time.time()
    params = LLAMA_STANDARD_CONFIGS[args.model_size]
    ckpt = {}
    for shard_path in sorted(Path(args.checkpoint_dir).glob("*.bin")):
        shard = torch.load(shard_path, map_location="cpu")
        for name, tensor in shard.items():
            key = name[6:] if name.startswith("model.") else name
            ckpt[key] = tensor
    print("Start convert weight to easylm format...")
    def as_numpy(key):
        return ckpt[key].numpy()
    def kernel(key):
        # Projection weights are stored transposed in the EasyLM tree.
        return as_numpy(key).transpose()
    def rotary_kernel(key):
        # q/k projections additionally get their rows reordered per head.
        return inverse_permute(params, as_numpy(key)).transpose()
    def convert_layer(layer):
        prefix = f"layers.{layer}"
        return {
            "attention": {
                "wq": {"kernel": rotary_kernel(f"{prefix}.self_attn.q_proj.weight")},
                "wk": {"kernel": rotary_kernel(f"{prefix}.self_attn.k_proj.weight")},
                "wv": {"kernel": kernel(f"{prefix}.self_attn.v_proj.weight")},
                "wo": {"kernel": kernel(f"{prefix}.self_attn.o_proj.weight")},
            },
            "feed_forward": {
                "w1": {"kernel": kernel(f"{prefix}.mlp.gate_proj.weight")},
                "w2": {"kernel": kernel(f"{prefix}.mlp.down_proj.weight")},
                "w3": {"kernel": kernel(f"{prefix}.mlp.up_proj.weight")},
            },
            "attention_norm": {
                "kernel": as_numpy(f"{prefix}.input_layernorm.weight")
            },
            "ffn_norm": {
                "kernel": as_numpy(f"{prefix}.post_attention_layernorm.weight")
            },
        }
    jax_weights = {
        "transformer": {
            "wte": {"embedding": as_numpy("embed_tokens.weight")},
            "ln_f": {"kernel": as_numpy("norm.weight")},
            "h": {
                str(layer): convert_layer(layer)
                for layer in range(params["n_layers"])
            },
        },
        "lm_head": {"kernel": kernel("lm_head.weight")},
    }
    print("Convert weight to easylm format finished...")
    print("Start to save...")
    if not args.streaming:
        serialized = flax.serialization.msgpack_serialize(jax_weights, in_place=True)
        with mlxu.open_file(args.output_file, "wb") as fout:
            fout.write(serialized)
    else:
        StreamingCheckpointer.save_train_state_to_file(jax_weights, args.output_file)
    print(
        f"Save finished!!! take time: {time.time() - start} save path: {args.output_file}"
    )
 # Command-line entry point: parse the arguments, echo them, run the conversion.
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="hf to easylm format script")
    # Both paths are mandatory; without them main() would fail later with an
    # unhelpful error, so argparse reports the omission up front.
    parser.add_argument(
        "--checkpoint_dir",
        type=str,
        required=True,
        help="Need to be converted model weight dir. it is a dir",
    )
    parser.add_argument(
        "--output_file",
        type=str,
        required=True,
        help="Save model weight file path, it is a file.",
    )
    parser.add_argument(
        "--model_size",
        type=str,
        default="7b",
        # Accept every size defined in the config table (including 1b and 3b).
        choices=list(LLAMA_STANDARD_CONFIGS),
        help="model size",
    )
    parser.add_argument(
        "--streaming",
        action="store_true",
        default=True,
        help="whether is model weight saved stream format",
    )
    # --streaming already defaults to True, so this is the only way to
    # select the non-streaming (single msgpack file) output.
    parser.add_argument(
        "--no-streaming",
        dest="streaming",
        action="store_false",
        help="save a single msgpack file instead of the stream format",
    )
    args = parser.parse_args()
    print(f"checkpoint_dir: {args.checkpoint_dir}")
    print(f"output_file: {args.output_file}")
    print(f"model_size: {args.model_size}")
    print(f"streaming: {args.streaming}")
    main(args)
--- a/EasyLM/models/llama/convert_torch_to_easylm.py
+++ b/EasyLM/models/llama/convert_torch_to_easylm.py
@@ -0,0 +1,68 @@
 # This script converts the standard LLaMA PyTorch checkpoint released by Meta
 # to the EasyLM checkpoint format. The converted checkpoint can then be loaded
 # by EasyLM for fine-tuning or inference.
 # This script is largely borrowed from https://github.com/Sea-Snell/JAX_llama
 from pathlib import Path
 import json
 import numpy as np
 import torch
 import flax
 import mlxu
 from EasyLM.checkpoint import StreamingCheckpointer
 # Flags for the Meta PyTorch -> EasyLM conversion script.
 FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
    checkpoint_dir='',  # directory holding the *.pth shards and params.json
    output_file='',  # destination file for the converted weights
    streaming=True,  # True: streaming checkpoint format; False: one msgpack blob
 )
 def main(argv):
    """Convert sharded ``*.pth`` LLaMA checkpoints into an EasyLM checkpoint.

    Loads every ``*.pth`` file in FLAGS.checkpoint_dir, orders the shards by
    the integer in the second dot-separated field of the file name, and
    merges them by concatenating along the axis each weight is split on.
    Norm weights are taken from the first shard only. The result is written
    to FLAGS.output_file in streaming or msgpack format. ``argv`` is unused.
    """
    checkpoint_dir = Path(FLAGS.checkpoint_dir)
    shards_by_index = {}
    for shard_path in sorted(checkpoint_dir.glob("*.pth")):
        shard = torch.load(shard_path, map_location="cpu")
        shard_index = int(shard_path.name.split('.', maxsplit=2)[1])
        shards_by_index[shard_index] = shard
    ckpts = [shards_by_index[index] for index in sorted(shards_by_index)]
    with open(checkpoint_dir / "params.json", "r") as f:
        params = json.loads(f.read())
    def merged(key, axis):
        # Join the per-shard slices of one weight along its split axis.
        return np.concatenate([shard[key].numpy() for shard in ckpts], axis=axis)
    def merged_kernel(key, axis):
        return merged(key, axis).transpose()
    def first_shard(key):
        return ckpts[0][key].numpy()
    def convert_layer(layer):
        prefix = f'layers.{layer}'
        return {
            'attention': {
                'wq': {'kernel': merged_kernel(f'{prefix}.attention.wq.weight', 0)},
                'wk': {'kernel': merged_kernel(f'{prefix}.attention.wk.weight', 0)},
                'wv': {'kernel': merged_kernel(f'{prefix}.attention.wv.weight', 0)},
                'wo': {'kernel': merged_kernel(f'{prefix}.attention.wo.weight', 1)},
            },
            'feed_forward': {
                'w1': {'kernel': merged_kernel(f'{prefix}.feed_forward.w1.weight', 0)},
                'w2': {'kernel': merged_kernel(f'{prefix}.feed_forward.w2.weight', 1)},
                'w3': {'kernel': merged_kernel(f'{prefix}.feed_forward.w3.weight', 0)},
            },
            'attention_norm': {'kernel': first_shard(f'{prefix}.attention_norm.weight')},
            'ffn_norm': {'kernel': first_shard(f'{prefix}.ffn_norm.weight')},
        }
    jax_weights = {
        'transformer': {
            'wte': {'embedding': merged('tok_embeddings.weight', 1)},
            'ln_f': {'kernel': first_shard('norm.weight')},
            'h': {
                str(layer): convert_layer(layer)
                for layer in range(params['n_layers'])
            },
        },
        'lm_head': {'kernel': merged_kernel('output.weight', 0)},
    }
    if not FLAGS.streaming:
        serialized = flax.serialization.msgpack_serialize(jax_weights, in_place=True)
        with mlxu.open_file(FLAGS.output_file, 'wb') as fout:
            fout.write(serialized)
    else:
        StreamingCheckpointer.save_train_state_to_file(
            jax_weights, FLAGS.output_file
        )
 # Script entry point: hand `main` to mlxu.run when executed directly.
 if __name__ == '__main__':
    mlxu.run(main)
--- a/EasyLM/models/llama/llama_model.py
+++ b/EasyLM/models/llama/llama_model.py
--- a/EasyLM/models/llama/llama_serve.py
+++ b/EasyLM/models/llama/llama_serve.py
@@ -0,0 +1,386 @@
 import pprint
 from functools import partial
 import numpy as np
 import mlxu
 import jax
 import jax.numpy as jnp
 from jax.experimental.pjit import pjit
 from jax.sharding import PartitionSpec as PS
 import optax
 from transformers import GenerationConfig, FlaxLogitsProcessorList
 from EasyLM.checkpoint import StreamingCheckpointer
 from EasyLM.serving import LMServer
 from EasyLM.jax_utils import (
    JaxRNG, JaxDistributedConfig, next_rng, match_partition_rules, tree_apply,
    set_random_seed, get_float_dtype_by_name, make_shard_and_gather_fns,
    with_sharding_constraint, FlaxTemperatureLogitsWarper
 )
 from EasyLM.models.llama.llama_model import LLaMAConfig, FlaxLLaMAForCausalLM
 # Command-line flags for the LLaMA serving script. `FLAGS` is the object
 # main() reads its settings from; the keyword names below are the flag names
 # and the values are their defaults.
 FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
    # Passed to set_random_seed() and used as the model seed.
    seed=42,
    # NOTE(review): not read anywhere in this script's main().
    initialize_jax_distributed=False,
    # Mesh description handed to LLaMAConfig.get_jax_mesh().
    mesh_dim='1,-1,1',
    # Float dtype name resolved via get_float_dtype_by_name() when sharding.
    dtype='bf16',
    # Fixed token length prompts are padded/truncated to.
    input_length=1024,
    # Total sequence length; generation produces seq_length - input_length
    # new tokens and the continuation in loglikelihood() gets the same budget.
    seq_length=2048,
    # Sampling options forwarded to GenerationConfig in forward_generate.
    top_k=50,
    top_p=1.0,
    do_sample=True,
    num_beams=1,
    # When true, position 0 of the prompt is overwritten with BOS (generate /
    # greedy_until) or the leading BOS is marked as attended (loglikelihood).
    add_bos_token=True,
    # Argument to LLaMAConfig.load_config().
    load_llama_config='',
    # Argument to StreamingCheckpointer.load_trainstate_checkpoint().
    load_checkpoint='',
    # Nested default configs of the components used by main().
    tokenizer=LLaMAConfig.get_tokenizer_config(),
    lm_server=LMServer.get_default_config(),
    jax_distributed=JaxDistributedConfig.get_default_config(),
 )
 def main(argv):
    """Start an LMServer backed by a sharded LLaMA checkpoint.

    Loads the model config and parameters with the CPU as default device,
    shards the parameters over the mesh described by ``FLAGS.mesh_dim``,
    builds three pjit-ed kernels (log-likelihood scoring, sampled generation
    and greedy generation), exposes them through a nested ``ModelServer``
    subclass of ``LMServer`` and finally runs that server.

    Args:
        argv: Positional arguments handed over by ``mlxu.run``; not used.
    """
    JaxDistributedConfig.initialize(FLAGS.jax_distributed)
    set_random_seed(FLAGS.seed)
    # Two tokenizers over the same config: prompts are padded/truncated on the
    # left so their last token sits directly before the continuation, while
    # continuations are padded/truncated on the right.
    prefix_tokenizer = LLaMAConfig.get_tokenizer(
        FLAGS.tokenizer, truncation_side='left', padding_side='left'
    )
    tokenizer = LLaMAConfig.get_tokenizer(
        FLAGS.tokenizer, truncation_side='right', padding_side='right'
    )
    # Load config and weights with the CPU as default device; the weights are
    # moved onto the mesh later by the shard functions.
    with jax.default_device(jax.devices("cpu")[0]):
        llama_config = LLaMAConfig.load_config(FLAGS.load_llama_config)
        _, params = StreamingCheckpointer.load_trainstate_checkpoint(
            FLAGS.load_checkpoint, disallow_trainstate=True
        )
        hf_model = FlaxLLaMAForCausalLM(
            llama_config,
            input_shape=(1, FLAGS.seq_length),
            seed=FLAGS.seed,
            _do_init=False
        )
    model_ps = match_partition_rules(
        LLaMAConfig.get_partition_rules(), params
    )
    shard_fns, _ = make_shard_and_gather_fns(
        model_ps, get_float_dtype_by_name(FLAGS.dtype)
    )
    @partial(
        pjit,
        in_shardings=(model_ps, PS(), PS()),
        out_shardings=(PS(), PS(), PS())
    )
    def forward_loglikelihood(params, rng, batch):
        """Score ``batch['output_tokens']`` under the model.

        Returns a tuple of the per-example log-likelihood summed over the
        positions selected by ``batch['output_mask']``, a per-example flag
        telling whether the argmax prediction equals the target at every
        selected position, and a value drawn from the RNG wrapper that the
        caller stores as its next RNG.
        """
        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
        rng_generator = JaxRNG(rng)
        input_tokens = batch['input_tokens']
        output_tokens = batch['output_tokens']
        input_mask = batch['input_mask']
        output_mask = batch['output_mask']
        logits = hf_model.module.apply(
            params, input_tokens, attention_mask=input_mask,
            deterministic=True, rngs=rng_generator(llama_config.rng_keys()),
        ).logits
        # if llama_config.n_real_tokens is not None:
        #   logits = logits.at[:, :, llama_config.n_real_tokens:].set(-1e8)
        loglikelihood = -optax.softmax_cross_entropy_with_integer_labels(
            logits, output_tokens
        )
        loglikelihood = jnp.sum(loglikelihood * output_mask, axis=-1)
        # Greedy decoding reproduces the target iff every masked position
        # matches, i.e. the number of matches equals the number of masked
        # positions.
        match_count = jnp.sum(
            (jnp.argmax(logits, axis=-1) == output_tokens) * output_mask,
            axis=-1
        )
        total = jnp.sum(output_mask, axis=-1)
        is_greedy = match_count == total
        return loglikelihood, is_greedy, rng_generator()
    @partial(
        pjit,
        in_shardings=(model_ps, PS(), PS(), PS()),
        out_shardings=(PS(), PS())
    )
    def forward_generate(params, rng, batch, temperature):
        """Generate continuations using the sampling settings from FLAGS.

        ``temperature`` is applied through ``FlaxTemperatureLogitsWarper``.
        Only the columns after the prompt length are returned, together with
        a value drawn from the RNG wrapper that the caller stores as its next
        RNG.
        """
        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
        rng_generator = JaxRNG(rng)
        output = hf_model.generate(
            batch['input_tokens'],
            attention_mask=batch['attention_mask'],
            params=params['params'],
            prng_key=rng_generator(),
            logits_processor=FlaxLogitsProcessorList(
                [FlaxTemperatureLogitsWarper(temperature)]
            ),
            generation_config=GenerationConfig(
                max_new_tokens=FLAGS.seq_length - FLAGS.input_length,
                pad_token_id=tokenizer.eos_token_id,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                do_sample=FLAGS.do_sample,
                num_beams=FLAGS.num_beams,
                top_k=FLAGS.top_k,
                top_p=FLAGS.top_p,
            )
        ).sequences[:, batch['input_tokens'].shape[1]:]
        return output, rng_generator()
    @partial(
        pjit,
        in_shardings=(model_ps, PS(), PS()),
        out_shardings=(PS(), PS())
    )
    def forward_greedy_generate(params, rng, batch):
        """Generate continuations with sampling disabled and a single beam.

        Same contract as ``forward_generate`` minus the temperature argument.
        """
        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
        rng_generator = JaxRNG(rng)
        output = hf_model.generate(
            batch['input_tokens'],
            attention_mask=batch['attention_mask'],
            params=params['params'],
            prng_key=rng_generator(),
            generation_config=GenerationConfig(
                max_new_tokens=FLAGS.seq_length - FLAGS.input_length,
                pad_token_id=tokenizer.eos_token_id,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                do_sample=False,
                num_beams=1,
            )
        ).sequences[:, batch['input_tokens'].shape[1]:]
        return output, rng_generator()
    mesh = LLaMAConfig.get_jax_mesh(FLAGS.mesh_dim)
    with mesh:
        params = tree_apply(shard_fns, params)
        sharded_rng = next_rng()
    class ModelServer(LMServer):
        """LMServer implementation backed by the kernels defined in main().

        All methods are static and share ``sharded_rng`` from the enclosing
        scope, replacing it with the RNG returned by each kernel call.
        """
        @staticmethod
        def loglikelihood(prefix_text, text):
            """Score ``text`` as a continuation of ``prefix_text``.

            Returns the ``(loglikelihood, is_greedy)`` pair produced by
            ``forward_loglikelihood``, fetched to the host.
            """
            nonlocal sharded_rng
            prefix = prefix_tokenizer(
                prefix_text,
                padding='max_length',
                truncation=True,
                max_length=FLAGS.input_length,
                return_tensors='np',
            )
            inputs = tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=FLAGS.seq_length - FLAGS.input_length,
                return_tensors='np',
            )
            output_tokens = np.concatenate([prefix.input_ids, inputs.input_ids], axis=1)
            # Model inputs are the targets shifted right by one position with
            # a BOS token in front.
            bos_tokens = np.full(
                (output_tokens.shape[0], 1), tokenizer.bos_token_id, dtype=np.int32
            )
            input_tokens = np.concatenate([bos_tokens, output_tokens[:, :-1]], axis=-1)
            input_mask = np.concatenate(
                [prefix.attention_mask, inputs.attention_mask], axis=1
            )
            # The attention mask is shifted the same way; the BOS position is
            # attended only when add_bos_token is set.
            if FLAGS.add_bos_token:
                bos_mask = np.ones_like(input_mask[:, :1])
            else:
                bos_mask = np.zeros_like(input_mask[:, :1])
            input_mask = np.concatenate([bos_mask, input_mask[:, :-1]], axis=1)
            # Only continuation tokens contribute to the score.
            output_mask = np.concatenate(
                [np.zeros_like(prefix.attention_mask), inputs.attention_mask], axis=1
            )
            batch = dict(
                input_tokens=input_tokens,
                output_tokens=output_tokens,
                input_mask=input_mask,
                output_mask=output_mask,
            )
            with mesh:
                loglikelihood, is_greedy, sharded_rng = forward_loglikelihood(
                    params, sharded_rng, batch
                )
                loglikelihood, is_greedy = jax.device_get((loglikelihood, is_greedy))
            return loglikelihood, is_greedy
        @staticmethod
        def loglikelihood_rolling(text):
            """Score ``text`` of arbitrary length in windows of seq_length.

            The text is tokenized without truncation, right-padded up to
            ``FLAGS.seq_length`` if shorter, and scored window by window.
            Returns the log-likelihood summed over all windows and the
            logical AND of the per-window greedy flags.
            """
            nonlocal sharded_rng
            inputs = tokenizer(
                text,
                padding='longest',
                truncation=False,
                max_length=np.iinfo(np.int32).max,
                return_tensors='np',
            )
            batch_size = inputs.input_ids.shape[0]
            output_tokens = inputs.input_ids
            attention_mask = inputs.attention_mask
            if output_tokens.shape[1] < FLAGS.seq_length:
                # Pad so that at least one full window exists.
                padding_length = FLAGS.seq_length - output_tokens.shape[1]
                pad_tokens = np.full(
                    (batch_size, padding_length), tokenizer.pad_token_id, dtype=np.int32
                )
                output_tokens = np.concatenate([output_tokens, pad_tokens], axis=-1)
                pad_mask = np.zeros(
                    (batch_size, padding_length), dtype=inputs.attention_mask.dtype
                )
                attention_mask = np.concatenate([attention_mask, pad_mask], axis=-1)
            # Model inputs are the targets shifted right by one with BOS in
            # front. The attention mask is used unshifted below.
            bos_tokens = np.full(
                (batch_size, 1), tokenizer.bos_token_id, dtype=np.int32
            )
            input_tokens = np.concatenate([bos_tokens, output_tokens[:, :-1]], axis=-1)
            total_seq_length = output_tokens.shape[1]
            total_loglikelihood = 0.0
            total_is_greedy = True
            # Sliding window
            for i in range(0, total_seq_length, FLAGS.seq_length):
                # Last window
                if i + FLAGS.seq_length > total_seq_length:
                    # The final partial window is served by the last
                    # seq_length tokens; everything except the trailing
                    # (total_seq_length - i) positions was already scored by
                    # earlier windows, so it is masked out of the score.
                    last_output_mask = np.copy(attention_mask[:, -FLAGS.seq_length:])
                    last_output_mask[:, :i - total_seq_length] = 0.0
                    batch = dict(
                        input_tokens=input_tokens[:, -FLAGS.seq_length:],
                        output_tokens=output_tokens[:, -FLAGS.seq_length:],
                        input_mask=attention_mask[:, -FLAGS.seq_length:],
                        output_mask=last_output_mask,
                    )
                # Normal window
                else:
                    batch = dict(
                        input_tokens=input_tokens[:, i:i + FLAGS.seq_length],
                        output_tokens=output_tokens[:, i:i + FLAGS.seq_length],
                        input_mask=attention_mask[:, i:i + FLAGS.seq_length],
                        output_mask=attention_mask[:, i:i + FLAGS.seq_length],
                    )
                with mesh:
                    loglikelihood, is_greedy, sharded_rng = forward_loglikelihood(
                        params, sharded_rng, batch
                    )
                    loglikelihood, is_greedy = jax.device_get((loglikelihood, is_greedy))
                total_loglikelihood += loglikelihood
                total_is_greedy = np.logical_and(is_greedy, total_is_greedy)
            return total_loglikelihood, total_is_greedy
        @staticmethod
        def generate(text, temperature):
            """Generate a continuation for each prompt in ``text``.

            Prompts are left-padded/truncated to ``FLAGS.input_length``.
            Returns a list of decoded strings, each cut at the first EOS
            token text if one is present.
            """
            nonlocal sharded_rng
            inputs = prefix_tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=FLAGS.input_length,
                return_tensors='np',
            )
            input_tokens = inputs.input_ids
            input_mask = inputs.attention_mask
            if FLAGS.add_bos_token:
                # Overwrite the first position with an attended BOS token.
                input_tokens[:, 0] = tokenizer.bos_token_id
                input_mask[:, 0] = 1
            batch = dict(
                input_tokens=input_tokens,
                attention_mask=input_mask,
            )
            with mesh:
                output, sharded_rng = forward_generate(
                    params, sharded_rng, batch, temperature
                )
                output = jax.device_get(output)
            output_text = []
            # A dedicated loop variable avoids rebinding the `text` parameter.
            for decoded in tokenizer.batch_decode(output):
                if tokenizer.eos_token in decoded:
                    decoded = decoded.split(tokenizer.eos_token, maxsplit=1)[0]
                output_text.append(decoded)
            return output_text
        @staticmethod
        def greedy_until(prefix_text, until, max_length):
            """Greedily extend each prefix until a stop string or max_length.

            For every ``(prefix, until)`` pair the model is called repeatedly,
            one example at a time; each round's decoded output is appended to
            the prefix and the prefix is re-tokenized. Generation stops when
            one of the stop strings occurs in the accumulated text (which is
            then cut before it) or when at least ``max_length`` tokens have
            been produced. Returns the list of generated strings.
            """
            nonlocal sharded_rng
            all_outputs = []
            for pf, ut in zip(prefix_text, until):
                if isinstance(ut, str):
                    ut = [ut]
                total_length = 0
                total_generated = ''
                while total_length < max_length:
                    pf_tokens = tokenizer(
                        pf,
                        padding=False,
                        truncation=False,
                        max_length=np.iinfo(np.int32).max,
                        return_tensors='np',
                    )
                    input_tokens = pf_tokens.input_ids
                    attention_mask = pf_tokens.attention_mask
                    # Bring the prompt to exactly input_length tokens: pad on
                    # the left when short, keep the most recent tokens when
                    # long.
                    if input_tokens.shape[1] < FLAGS.input_length:
                        extra = FLAGS.input_length - input_tokens.shape[1]
                        pad_tokens = np.full(
                            (1, extra), tokenizer.pad_token_id, dtype=np.int32
                        )
                        input_tokens = np.concatenate(
                            [pad_tokens, input_tokens], axis=1
                        )
                        pad_attention = np.zeros((1, extra), dtype=attention_mask.dtype)
                        attention_mask = np.concatenate(
                            [pad_attention, attention_mask], axis=1
                        )
                    elif input_tokens.shape[1] > FLAGS.input_length:
                        input_tokens = input_tokens[:, -FLAGS.input_length:]
                        attention_mask = attention_mask[:, -FLAGS.input_length:]
                    if FLAGS.add_bos_token:
                        input_tokens[:, 0] = tokenizer.bos_token_id
                        attention_mask[:, 0] = 1
                    batch = dict(input_tokens=input_tokens, attention_mask=attention_mask)
                    with mesh:
                        output, sharded_rng = forward_greedy_generate(
                            params, sharded_rng, batch
                        )
                        output = jax.device_get(output)
                    total_length += output.shape[1]
                    output_text = tokenizer.batch_decode(output)[0]
                    total_generated = total_generated + output_text
                    pf = pf + output_text
                    done = False
                    # Every stop string is checked; each hit trims the text
                    # accumulated so far before that stop string.
                    for s in ut:
                        if s in total_generated:
                            total_generated = total_generated.split(s, maxsplit=1)[0]
                            done = True
                    if done:
                        break
                all_outputs.append(total_generated)
            return all_outputs
    server = ModelServer(FLAGS.lm_server)
    server.run()
 # Script entry point: when executed directly, hand `main` to mlxu.run
 # (presumably it parses the flags defined above first — confirm against mlxu).
 if __name__ == "__main__":
    mlxu.run(main)
--- a/EasyLM/models/llama/llama_train.py
+++ b/EasyLM/models/llama/llama_train.py
@@ -0,0 +1,268 @@
 import pprint
 from functools import partial
 from tqdm import tqdm, trange
 import numpy as np
 import mlxu
 import jax
 import jax.numpy as jnp
 from jax.experimental.pjit import pjit
 from jax.sharding import PartitionSpec as PS
 from flax.training.train_state import TrainState
 from EasyLM.data import DatasetFactory
 from EasyLM.checkpoint import StreamingCheckpointer
 from EasyLM.optimizers import OptimizerFactory
 from EasyLM.jax_utils import (
    JaxRNG, JaxDistributedConfig, next_rng, match_partition_rules,
    cross_entropy_loss_and_accuracy, global_norm, get_float_dtype_by_name,
    set_random_seed, average_metrics, get_weight_decay_mask,
    make_shard_and_gather_fns, with_sharding_constraint,
 )
 from EasyLM.models.llama.llama_model import (
    LLaMAConfig, FlaxLLaMAForCausalLMModule
 )
 # Command-line flags for the LLaMA training script. `FLAGS` is the object
 # main() reads its settings from; `FLAGS_DEF` is passed together with it to
 # mlxu.get_user_flags / mlxu.user_flags_to_config_dict in main().
 FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
    # Passed to set_random_seed().
    seed=42,
    # Mesh description handed to LLaMAConfig.get_jax_mesh().
    mesh_dim='1,-1,1',
    # Float dtype names resolved via get_float_dtype_by_name() and given to
    # the model as `dtype` and `param_dtype` respectively.
    dtype='fp32',
    param_dtype='fp32',
    # Upper bound of the training step range.
    total_steps=10000,
    # When non-empty, the model config is loaded with LLaMAConfig.load_config
    # instead of being built from the `llama` flag group.
    load_llama_config='',
    # When non-empty, this string is eval()-ed and merged into the config.
    update_llama_config='',
    # When non-empty, passed to checkpointer.load_trainstate_checkpoint().
    load_checkpoint='',
    # When non-empty, a pickle loaded into dataset.load_state_dict().
    load_dataset_state='',
    # Periods in steps; each feature is disabled when its value is 0.
    log_freq=50,
    save_model_freq=0,
    save_milestone_freq=0,
    eval_freq=0,
    # Nested default configs of the components used by main().
    tokenizer=LLaMAConfig.get_tokenizer_config(),
    train_dataset=DatasetFactory.get_default_config(),
    eval_dataset=DatasetFactory.get_default_config(),
    optimizer=OptimizerFactory.get_default_config(),
    checkpointer=StreamingCheckpointer.get_default_config(),
    llama=LLaMAConfig.get_default_config(),
    logger=mlxu.WandBLogger.get_default_config(),
    # When false, only the process with jax.process_index() == 0 logs.
    log_all_worker=False,
    jax_distributed=JaxDistributedConfig.get_default_config(),
 )
 def main(argv):
    """Train a LLaMA causal language model with pjit-sharded train state.

    Builds the logger, tokenizer, datasets, model config, model and optimizer
    from FLAGS, derives the partitioning of the train state, optionally
    restores a checkpoint, and then runs the training loop with periodic
    evaluation, logging and checkpointing.

    Args:
        argv: Positional arguments handed over by ``mlxu.run``; not used.
    """
    JaxDistributedConfig.initialize(FLAGS.jax_distributed)
    variant = mlxu.get_user_flags(FLAGS, FLAGS_DEF)
    flags_config_dict = mlxu.user_flags_to_config_dict(FLAGS, FLAGS_DEF)
    # Logging is enabled on process 0 only unless log_all_worker is set.
    logger = mlxu.WandBLogger(
        config=FLAGS.logger,
        variant=variant,
        enable=FLAGS.log_all_worker or (jax.process_index() == 0),
    )
    set_random_seed(FLAGS.seed)
    tokenizer = LLaMAConfig.get_tokenizer(FLAGS.tokenizer)
    dataset = DatasetFactory.load_dataset(FLAGS.train_dataset, tokenizer)
    if FLAGS.load_dataset_state != '':
        dataset.load_state_dict(mlxu.load_pickle(FLAGS.load_dataset_state))
    # eval_dataset is bound only when evaluation is enabled; it is read in the
    # training loop under the same eval_freq > 0 condition.
    if FLAGS.eval_freq > 0:
        eval_dataset = DatasetFactory.load_dataset(
            FLAGS.eval_dataset, dataset.tokenizer, eval_dataset=True
        )
    seq_length = dataset.seq_length
    if FLAGS.load_llama_config != '':
        llama_config = LLaMAConfig.load_config(FLAGS.load_llama_config)
    else:
        llama_config = LLaMAConfig(**FLAGS.llama)
    if FLAGS.update_llama_config != '':
        # NOTE(review): eval() executes arbitrary Python contained in the flag
        # string; only acceptable while this flag is supplied by a trusted
        # operator.
        llama_config.update(dict(eval(FLAGS.update_llama_config)))
    llama_config.update(dict(
        bos_token_id=dataset.tokenizer.bos_token_id,
        eos_token_id=dataset.tokenizer.eos_token_id,
    ))
    # The model vocabulary is only ever grown to fit the dataset, never shrunk.
    if llama_config.vocab_size < dataset.vocab_size:
        print("Updating model config vocab size from", llama_config.vocab_size, "to", dataset.vocab_size)
        llama_config.update(dict(vocab_size=dataset.vocab_size))
    model = FlaxLLaMAForCausalLMModule(
        llama_config, dtype=get_float_dtype_by_name(FLAGS.dtype), param_dtype=get_float_dtype_by_name(FLAGS.param_dtype)
    )
    optimizer, optimizer_info = OptimizerFactory.get_optimizer(
        FLAGS.optimizer,
        get_weight_decay_mask(LLaMAConfig.get_weight_decay_exclusions())
    )
    def create_trainstate_from_params(params):
        """Wrap already-loaded parameters in a fresh TrainState."""
        return TrainState.create(params=params, tx=optimizer, apply_fn=None)
    def init_fn(rng):
        """Initialize model parameters from `rng` and wrap them in a TrainState."""
        rng_generator = JaxRNG(rng)
        # Dummy batch of 4 sequences of length seq_length used for model.init.
        params = model.init(
            input_ids=jnp.zeros((4, seq_length), dtype=jnp.int32),
            position_ids=jnp.zeros((4, seq_length), dtype=jnp.int32),
            attention_mask=jnp.ones((4, seq_length), dtype=jnp.int32),
            rngs=rng_generator(llama_config.rng_keys()),
        )
        return TrainState.create(params=params, tx=optimizer, apply_fn=None)
    def train_step(train_state, rng, batch):
        """Run one optimization step.

        Returns the updated train state, the next RNG value and a dict of
        metrics (loss, accuracy, learning rate, gradient norm, param norm).
        """
        rng_generator = JaxRNG(rng)
        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
        def loss_and_accuracy(params):
            logits = model.apply(
                params, batch['input_tokens'], deterministic=False,
                rngs=rng_generator(llama_config.rng_keys()),
            ).logits
            return cross_entropy_loss_and_accuracy(
                logits, batch['target_tokens'], batch['loss_masks']
            )
        # has_aux=True: the loss is differentiated, accuracy rides along.
        grad_fn = jax.value_and_grad(loss_and_accuracy, has_aux=True)
        (loss, accuracy), grads = grad_fn(train_state.params)
        train_state = train_state.apply_gradients(grads=grads)
        # learning_rate and param_norm are read from the state after the
        # gradient update has been applied.
        metrics = dict(
            loss=loss,
            accuracy=accuracy,
            learning_rate=optimizer_info['learning_rate_schedule'](train_state.step),
            gradient_norm=global_norm(grads),
            param_norm=global_norm(train_state.params),
        )
        return train_state, rng_generator(), metrics
    def eval_step(train_state, rng, batch):
        """Compute eval loss and accuracy for one batch without updating state.

        Returns the next RNG value and a dict with eval_loss / eval_accuracy.
        """
        rng_generator = JaxRNG(rng)
        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
        logits = model.apply(
            train_state.params, batch['input_tokens'], deterministic=True,
            rngs=rng_generator(llama_config.rng_keys()),
        ).logits
        loss, accuracy = cross_entropy_loss_and_accuracy(
            logits, batch['target_tokens'], batch['loss_masks']
        )
        metrics = dict(
            eval_loss=loss,
            eval_accuracy=accuracy,
        )
        return rng_generator(), metrics
    # Trace init_fn abstractly to obtain the train state's shapes, then map
    # the partition rules onto that structure.
    train_state_shapes = jax.eval_shape(init_fn, next_rng())
    train_state_partition = match_partition_rules(
        LLaMAConfig.get_partition_rules(), train_state_shapes
    )
    shard_fns, gather_fns = make_shard_and_gather_fns(
        train_state_partition, train_state_shapes
    )
    # Checkpoint writing is enabled on process 0 only.
    checkpointer = StreamingCheckpointer(
        FLAGS.checkpointer, logger.output_dir,
        enable=jax.process_index() == 0,
    )
    sharded_init_fn = pjit(
        init_fn,
        in_shardings=PS(),
        out_shardings=train_state_partition
    )
    # donate_argnums marks input buffers whose storage may be reused for the
    # outputs: the restored params here, and the old train state / RNG below.
    sharded_create_trainstate_from_params = pjit(
        create_trainstate_from_params,
        in_shardings=(train_state_partition.params, ),
        out_shardings=train_state_partition,
        donate_argnums=(0, ),
    )
    sharded_train_step = pjit(
        train_step,
        in_shardings=(train_state_partition, PS(), PS()),
        out_shardings=(train_state_partition, PS(), PS()),
        donate_argnums=(0, 1),
    )
    sharded_eval_step = pjit(
        eval_step,
        in_shardings=(train_state_partition, PS(), PS()),
        out_shardings=(PS(), PS()),
        donate_argnums=(1,),
    )
    def save_checkpoint(train_state, milestone=False):
        """Save the train state, run metadata and dataset state."""
        step = int(jax.device_get(train_state.step))
        metadata = dict(
            step=step,
            variant=variant,
            flags=flags_config_dict,
            llama_config=llama_config.to_dict(),
        )
        checkpointer.save_all(
            train_state=train_state,
            gather_fns=gather_fns,
            metadata=metadata,
            dataset=dataset.get_state_dict(),
            milestone=milestone,
        )
    mesh = LLaMAConfig.get_jax_mesh(FLAGS.mesh_dim)
    with mesh:
        train_state, restored_params = None, None
        if FLAGS.load_checkpoint != '':
            train_state, restored_params = checkpointer.load_trainstate_checkpoint(
                FLAGS.load_checkpoint, train_state_shapes, shard_fns
            )
        if train_state is None and restored_params is None:
            # Initialize from scratch
            train_state = sharded_init_fn(next_rng())
        elif train_state is None and restored_params is not None:
            # Restore from params but initialize train_state
            train_state = sharded_create_trainstate_from_params(restored_params)
            del restored_params
        # Resume the step counter from the (possibly restored) train state.
        start_step = int(jax.device_get(train_state.step))
        # With periodic saving enabled, a checkpoint is also written before
        # the first training step.
        if FLAGS.save_model_freq > 0:
            save_checkpoint(train_state)
        sharded_rng = next_rng()
        step_counter = trange(start_step, FLAGS.total_steps, ncols=0)
        # zip() ends the loop when either the step range or the dataset is
        # exhausted, whichever comes first.
        for step, (batch, dataset_metrics) in zip(step_counter, dataset):
            train_state, sharded_rng, metrics = sharded_train_step(
                train_state, sharded_rng, batch
            )
            if FLAGS.eval_freq > 0 and (step + 1) % FLAGS.eval_freq == 0:
                # Iterate the evaluation dataset from a fresh iterator and
                # merge the averaged metrics into this step's metrics.
                eval_metric_list = []
                eval_iterator = iter(eval_dataset)
                for eval_batch, _ in eval_iterator:
                    sharded_rng, eval_metrics = sharded_eval_step(
                        train_state, sharded_rng, eval_batch
                    )
                    eval_metric_list.append(eval_metrics)
                metrics.update(average_metrics(eval_metric_list))
            if FLAGS.log_freq > 0 and (step + 1) % FLAGS.log_freq == 0:
                log_metrics = {"step": step + 1}
                log_metrics.update(metrics)
                log_metrics.update(dataset_metrics)
                log_metrics = jax.device_get(log_metrics)
                logger.log(log_metrics)
                tqdm.write("\n" + pprint.pformat(log_metrics) + "\n")
            # When both periods hit on the same step only the milestone
            # checkpoint is written.
            if FLAGS.save_milestone_freq > 0 and (step + 1) % FLAGS.save_milestone_freq == 0:
                save_checkpoint(train_state, milestone=True)
            elif FLAGS.save_model_freq > 0 and (step + 1) % FLAGS.save_model_freq == 0:
                save_checkpoint(train_state)
        # Final checkpoint after the loop ends.
        if FLAGS.save_model_freq > 0:
            save_checkpoint(train_state)
 # Script entry point: when executed directly, hand `main` to mlxu.run
 # (presumably it parses the flags defined above first — confirm against mlxu).
 if __name__ == "__main__":
    mlxu.run(main)
--- a/EasyLM/models/roberta/init.py
+++ b/EasyLM/models/roberta/init.py
--- a/EasyLM/models/roberta/roberta_model.py
+++ b/EasyLM/models/roberta/roberta_model.py
--- a/EasyLM/models/roberta/roberta_train.py
+++ b/EasyLM/models/roberta/roberta_train.py
@@ -0,0 +1,307 @@
 import dataclasses
 import pprint
 from functools import partial
 import re
 from tqdm import tqdm, trange
 import numpy as np
 import mlxu
 import jax
 import jax.numpy as jnp
 from jax.experimental.pjit import pjit, with_sharding_constraint
 from jax.sharding import PartitionSpec as PS
 from flax.training.train_state import TrainState
 from EasyLM.data import DatasetFactory
 from EasyLM.checkpoint import StreamingCheckpointer
 from EasyLM.optimizers import OptimizerFactory
 from EasyLM.jax_utils import (
    JaxRNG, JaxDistributedConfig, next_rng, match_partition_rules, get_float_dtype_by_name,
    cross_entropy_loss_and_accuracy, named_tree_map, global_norm,
    set_random_seed, average_metrics, get_weight_decay_mask,
    make_shard_and_gather_fns, tree_apply
 )
 from EasyLM.models.roberta.roberta_model import (
    RobertaConfig, FlaxRobertaForMaskedLMModule
 )
 # Command-line flags for the RoBERTa training script. `FLAGS` is the object
 # main() reads its settings from; `FLAGS_DEF` is passed together with it to
 # mlxu.get_user_flags / mlxu.user_flags_to_config_dict in main().
 FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
    # Passed to set_random_seed().
    seed=42,
    # Mesh description handed to RobertaConfig.get_jax_mesh().
    mesh_dim='-1,1,1',
    # Float dtype name resolved via get_float_dtype_by_name() for the model.
    dtype='fp32',
    # Probability with which each token position is selected for alteration
    # (masking / random replacement) in train_step and eval_step.
    mask_token_probability=0.15,
    total_steps=10000,
    # When non-empty, the model config is loaded with RobertaConfig.load_config
    # instead of being built from the `roberta` flag group.
    load_roberta_config='',
    # When non-empty, this string is eval()-ed and merged into the config.
    update_roberta_config='',
    # When non-empty, main() splits this on the first '::' into a load type
    # and a load path.
    load_checkpoint='',
    # When non-empty, a pickle loaded into dataset.load_state_dict().
    load_dataset_state='',
    log_freq=50,
    save_model_freq=0,
    save_milestone_freq=0,
    # The evaluation dataset is loaded only when this is greater than 0.
    eval_steps=0,
    # Nested default configs of the components used by main().
    tokenizer=RobertaConfig.get_tokenizer_config(),
    train_dataset=DatasetFactory.get_default_config(),
    eval_dataset=DatasetFactory.get_default_config(),
    optimizer=OptimizerFactory.get_default_config(),
    checkpointer=StreamingCheckpointer.get_default_config(),
    roberta=RobertaConfig.get_default_config(),
    logger=mlxu.WandBLogger.get_default_config(),
    # When false, only the process with jax.process_index() == 0 logs.
    log_all_worker=False,
    jax_distributed=JaxDistributedConfig.get_default_config(),
 )
 def main(argv):
    JaxDistributedConfig.initialize(FLAGS.jax_distributed)
    variant = mlxu.get_user_flags(FLAGS, FLAGS_DEF)
    flags_config_dict = mlxu.user_flags_to_config_dict(FLAGS, FLAGS_DEF)
    logger = mlxu.WandBLogger(
        config=FLAGS.logger,
        variant=variant,
        enable=FLAGS.log_all_worker or (jax.process_index() == 0),
    )
    set_random_seed(FLAGS.seed)
    tokenizer = RobertaConfig.get_tokenizer(FLAGS.tokenizer)
    dataset = DatasetFactory.load_dataset(FLAGS.train_dataset, tokenizer)
    if FLAGS.load_dataset_state != '':
        dataset.load_state_dict(mlxu.load_pickle(FLAGS.load_dataset_state))
    if FLAGS.eval_steps > 0:
        eval_dataset = DatasetFactory.load_dataset(
            FLAGS.eval_dataset, dataset.tokenizer
        )
        eval_iterator = iter(eval_dataset)
    seq_length = dataset.seq_length
    if FLAGS.load_roberta_config != '':
        roberta_config = RobertaConfig.load_config(FLAGS.load_roberta_config)
    else:
        roberta_config = RobertaConfig(**FLAGS.roberta)
    if FLAGS.update_roberta_config != '':
        roberta_config.update(dict(eval(FLAGS.update_roberta_config)))
    roberta_config.update(dict(
        bos_token_id=dataset.tokenizer.bos_token_id,
        eos_token_id=dataset.tokenizer.eos_token_id,
        pad_token_id=dataset.tokenizer.pad_token_id,
        vocab_size=dataset.vocab_size,
    ))
    model = FlaxRobertaForMaskedLMModule(
        roberta_config, dtype=get_float_dtype_by_name(FLAGS.dtype)
    )
    optimizer, optimizer_info = OptimizerFactory.get_optimizer(
        FLAGS.optimizer,
        get_weight_decay_mask(RobertaConfig.get_weight_decay_exclusions()),
    )
    def create_trainstate_from_params(params):
        return TrainState.create(params=params, tx=optimizer, apply_fn=None)
    def init_fn(rng):
        rng_generator = JaxRNG(rng)
        params = model.init(
            input_ids=jnp.zeros((4, seq_length), dtype=jnp.int32),
            position_ids=jnp.zeros((4, seq_length), dtype=jnp.int32),
            attention_mask=jnp.ones((4, seq_length), dtype=jnp.int32),
            token_type_ids=None,
            head_mask=None,
            rngs=rng_generator(roberta_config.rng_keys()),
        )
        return TrainState.create(params=params, tx=optimizer, apply_fn=None)
    def train_step(train_state, rng, batch):
        rng_generator = JaxRNG(rng)
        tokens = with_sharding_constraint(batch['target_tokens'], PS(('dp', 'fsdp')))
        def loss_and_accuracy(params):
            altered_tokens = jax.random.uniform(
                rng_generator(), shape=tokens.shape
            ) < FLAGS.mask_token_probability
            random_uniform = jax.random.uniform(rng_generator(), shape=tokens.shape)
            altered_by_mask = altered_tokens & (random_uniform < 0.8)
            altered_by_random = altered_tokens & (random_uniform >= 0.8) & (random_uniform < 0.9)
            inputs = jnp.where(altered_by_mask, dataset.tokenizer.mask_token_id, tokens)
            random_tokens = jax.random.randint(
                rng_generator(), shape=tokens.shape, minval=0, maxval=dataset.vocab_size
            )
            inputs = jnp.where(altered_by_random, random_tokens, inputs)
            logits = model.apply(
                params, inputs,
                attention_mask=jnp.ones_like(inputs),
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                deterministic=False,
                rngs=rng_generator(roberta_config.rng_keys()),
            ).logits
            return cross_entropy_loss_and_accuracy(logits, tokens, valid=altered_tokens)
        grad_fn = jax.value_and_grad(loss_and_accuracy, has_aux=True)
        (loss, accuracy), grads = grad_fn(train_state.params)
        train_state = train_state.apply_gradients(grads=grads)
        metrics = dict(
            loss=loss,
            accuracy=accuracy,
            learning_rate=optimizer_info['learning_rate_schedule'](train_state.step),
            gradient_norm=global_norm(grads),
            param_norm=global_norm(train_state.params),
        )
        return train_state, rng_generator(), metrics
    def eval_step(train_state, rng, batch):
        """ Compute masked-language-modeling loss and accuracy for one batch.

        A fraction `FLAGS.mask_token_probability` of the positions is selected
        for prediction. Of the selected positions, 80% are replaced by the
        tokenizer's mask token, 10% by a random token id and the remaining
        10% are left unchanged. The selection mask is passed as `valid` to
        `cross_entropy_loss_and_accuracy`.

        Args:
            train_state: train state whose `params` are evaluated.
            rng: JAX PRNG key used to draw the masking pattern.
            batch: mapping with a 'target_tokens' entry.

        Returns:
            A tuple of a fresh PRNG key and a dict with `eval_loss` and
            `eval_accuracy`.
        """
        rng_generator = JaxRNG(rng)
        tokens = with_sharding_constraint(batch['target_tokens'], PS(('dp', 'fsdp')))
        # Positions selected for prediction.
        altered_tokens = jax.random.uniform(
            rng_generator(), shape=tokens.shape
        ) < FLAGS.mask_token_probability
        # A second draw splits the selected positions:
        # [0, 0.8) -> mask token, [0.8, 0.9) -> random token, rest unchanged.
        random_uniform = jax.random.uniform(rng_generator(), shape=tokens.shape)
        altered_by_mask = altered_tokens & (random_uniform < 0.8)
        altered_by_random = altered_tokens & (random_uniform >= 0.8) & (random_uniform < 0.9)
        inputs = jnp.where(altered_by_mask, dataset.tokenizer.mask_token_id, tokens)
        random_tokens = jax.random.randint(
            rng_generator(), shape=tokens.shape, minval=0, maxval=dataset.vocab_size
        )
        inputs = jnp.where(altered_by_random, random_tokens, inputs)
        logits = model.apply(
            train_state.params, inputs,
            attention_mask=jnp.ones_like(inputs),
            token_type_ids=None,
            position_ids=None,
            head_mask=None,
            # Evaluation runs the model deterministically so that stochastic
            # layers such as dropout do not perturb the reported metrics.
            deterministic=True,
            # Still drawn so the RNG stream advances exactly as before.
            rngs=rng_generator(roberta_config.rng_keys()),
        ).logits
        loss, accuracy = cross_entropy_loss_and_accuracy(logits, tokens, valid=altered_tokens)
        metrics = dict(
            eval_loss=loss,
            eval_accuracy=accuracy,
        )
        return rng_generator(), metrics
    train_state_shapes = jax.eval_shape(init_fn, next_rng())
    train_state_partition = match_partition_rules(
        RobertaConfig.get_partition_rules(), train_state_shapes
    )
    shard_fns, gather_fns = make_shard_and_gather_fns(
        train_state_partition, train_state_shapes
    )
    checkpointer = StreamingCheckpointer(
        FLAGS.checkpointer, logger.output_dir,
        enable=jax.process_index() == 0
    )
    sharded_init_fn = pjit(
        init_fn,
        in_shardings=PS(),
        out_shardings=train_state_partition
    )
    sharded_create_trainstate_from_params = pjit(
        create_trainstate_from_params,
        in_shardings=(train_state_partition.params, ),
        out_shardings=train_state_partition,
        donate_argnums=(0, ),
    )
    sharded_train_step = pjit(
        train_step,
        in_shardings=(train_state_partition, PS(), PS()),
        out_shardings=(train_state_partition, PS(), PS()),
        donate_argnums=(0, 1),
    )
    sharded_eval_step = pjit(
        eval_step,
        in_shardings=(train_state_partition, PS(), PS()),
        out_shardings=(PS(), PS()),
        donate_argnums=(1,),
    )
    def save_checkpoint(train_state, milestone=False):
        """ Save the train state, the dataset state and run metadata.

        The metadata records the current step, `variant`, the flag values
        and the RoBERTa config as a dict. `milestone` is forwarded unchanged
        to `checkpointer.save_all`.
        """
        current_step = int(jax.device_get(train_state.step))
        checkpointer.save_all(
            train_state=train_state,
            gather_fns=gather_fns,
            metadata={
                'step': current_step,
                'variant': variant,
                'flags': flags_config_dict,
                'roberta_config': roberta_config.to_dict(),
            },
            dataset=dataset.get_state_dict(),
            milestone=milestone,
        )
    mesh = RobertaConfig.get_jax_mesh(FLAGS.mesh_dim)
    with mesh:
        train_state, restored_params = None, None
        if FLAGS.load_checkpoint != '':
            load_type, load_path = FLAGS.load_checkpoint.split('::', 1)
            if load_type == 'huggingface':
                restored_params = tree_apply(
                    shard_fns.params, roberta_config.load_pretrained(load_path)
                )
                train_state = None
            else:
                train_state, restored_params = checkpointer.load_trainstate_checkpoint(
                    FLAGS.load_checkpoint, train_state_shapes, shard_fns
                )
        if train_state is None and restored_params is None:
            # Initialize from scratch
            train_state = sharded_init_fn(next_rng())
        elif train_state is None and restored_params is not None:
            # Restore from params but initialize train_state
            train_state = sharded_create_trainstate_from_params(restored_params)
            del restored_params
        start_step = int(jax.device_get(train_state.step))
        if FLAGS.save_model_freq > 0:
            save_checkpoint(train_state)
        sharded_rng = next_rng()
        step_counter = trange(start_step, FLAGS.total_steps, ncols=0)
        for step, (batch, dataset_metrics) in zip(step_counter, dataset):
            train_state, sharded_rng, metrics = sharded_train_step(
                train_state, sharded_rng, batch
            )
            if step % FLAGS.log_freq == 0:
                if FLAGS.eval_steps > 0:
                    eval_metric_list = []
                    for _ in range(FLAGS.eval_steps):
                        eval_batch, _ = next(eval_iterator)
                        sharded_rng, eval_metrics = sharded_eval_step(
                            train_state, sharded_rng, eval_batch
                        )
                        eval_metric_list.append(eval_metrics)
                    metrics.update(average_metrics(eval_metric_list))
                log_metrics = {"step": step}
                log_metrics.update(metrics)
                log_metrics.update(dataset_metrics)
                log_metrics = jax.device_get(log_metrics)
                logger.log(log_metrics)
                tqdm.write("\n" + pprint.pformat(log_metrics) + "\n")
            if FLAGS.save_milestone_freq > 0 and (step + 1) % FLAGS.save_milestone_freq == 0:
                save_checkpoint(train_state, milestone=True)
            elif FLAGS.save_model_freq > 0 and (step + 1) % FLAGS.save_model_freq == 0:
                save_checkpoint(train_state)
        if FLAGS.save_model_freq > 0:
            save_checkpoint(train_state)
 # Start main through mlxu.run only when the file is executed as a script.
 if __name__ == "__main__":
    mlxu.run(main)
--- a/EasyLM/optimizers.py
+++ b/EasyLM/optimizers.py
@@ -0,0 +1,346 @@
 import os
 import time
 from typing import Any, Mapping, Text, Tuple, Union, NamedTuple
 from functools import partial
 import re
 import dataclasses
 import random
 from ml_collections.config_dict import config_dict
 from ml_collections import ConfigDict
 import jax
 import jax.numpy as jnp
 import numpy as np
 from absl import logging
 import optax
 from EasyLM.jax_utils import float_to_dtype
 class OptimizerFactory(object):
    """ Builds the optax optimizer selected by `config.type`.

    The class only groups factory functions; instantiating it raises
    NotImplementedError.
    """
    def __init__(self):
        raise NotImplementedError
    @staticmethod
    def get_default_config(updates=None):
        """ Return the default config, optionally overridden by `updates`.

        The defaults of every supported optimizer are nested under
        `palm_optimizer`, `adamw_optimizer` and `lion_optimizer`.
        """
        defaults = ConfigDict()
        defaults.accumulate_gradient_steps = 1
        defaults.type = 'adamw'
        defaults.palm_optimizer = PalmOptimizerFactory.get_default_config()
        defaults.adamw_optimizer = AdamWOptimizerFactory.get_default_config()
        defaults.lion_optimizer = LionOptimizerFactory.get_default_config()
        if updates is None:
            return defaults
        overrides = ConfigDict(updates).copy_and_resolve_references()
        defaults.update(overrides)
        return defaults
    @classmethod
    def get_optimizer(cls, config, weight_decay_mask=None):
        """ Build the optimizer described by `config`.

        Args:
            config: overrides applied on top of `get_default_config()`.
            weight_decay_mask: forwarded to the selected factory.

        Returns:
            A tuple `(optimizer, optimizer_info)` as produced by the selected
            factory. The optimizer is wrapped in `optax.MultiSteps` when
            `accumulate_gradient_steps` is greater than 1.

        Raises:
            ValueError: if `config.type` is not 'palm', 'adamw' or 'lion'.
        """
        config = cls.get_default_config(config)
        # Maps the optimizer type to its factory and its sub-config field.
        known_factories = {
            'palm': (PalmOptimizerFactory, 'palm_optimizer'),
            'adamw': (AdamWOptimizerFactory, 'adamw_optimizer'),
            'lion': (LionOptimizerFactory, 'lion_optimizer'),
        }
        if config.type not in known_factories:
            raise ValueError(f'Unknown optimizer type: {config.type}')
        factory, sub_config_name = known_factories[config.type]
        optimizer, optimizer_info = factory.get_optimizer(
            getattr(config, sub_config_name), weight_decay_mask
        )
        if config.accumulate_gradient_steps > 1:
            optimizer = optax.MultiSteps(
                optimizer, config.accumulate_gradient_steps
            )
        return optimizer, optimizer_info
 class PalmOptimizerFactory(object):
    """ Factory for the optimizer described in the PaLM paper
        (https://arxiv.org/abs/2204.02311): gradient clipping, unfactored
        Adafactor with parameter scaling, and scheduled weight decay.
    """
    def __init__(self):
        raise NotImplementedError
    @staticmethod
    def get_default_config(updates=None):
        """ Return the default PaLM optimizer config, optionally overridden
            by `updates`.
        """
        defaults = ConfigDict()
        defaults.lr = 0.01
        defaults.lr_warmup_steps = 10000
        defaults.b1 = 0.9
        defaults.b2 = 0.99
        defaults.clip_gradient = 1.0
        defaults.weight_decay = 1e-4
        defaults.bf16_momentum = False
        if updates is None:
            return defaults
        defaults.update(ConfigDict(updates).copy_and_resolve_references())
        return defaults
    @classmethod
    def get_optimizer(cls, config, weight_decay_mask=None):
        """ Build the PaLM-style optimizer.

        Args:
            config: overrides applied on top of `get_default_config()`.
            weight_decay_mask: mask forwarded to the scheduled weight decay.

        Returns:
            A tuple `(optimizer, optimizer_info)`; `optimizer_info` holds the
            `learning_rate_schedule` and `weight_decay_schedule` callables.
        """
        config = cls.get_default_config(config)
        # The schedules are expressed relative to the reference values 0.01
        # (learning rate) and 1e-4 (weight decay).
        lr_multiplier = config.lr / 0.01
        wd_multiplier = config.weight_decay / 1e-4
        def learning_rate_schedule(step):
            # Flat until lr_warmup_steps, then proportional to 1 / sqrt(step).
            clamped_step = jnp.maximum(step, config.lr_warmup_steps)
            return lr_multiplier / jnp.sqrt(clamped_step)
        def weight_decay_schedule(step):
            # Negative coefficient proportional to the squared learning rate.
            return -wd_multiplier * jnp.square(learning_rate_schedule(step))
        optimizer_info = {
            'learning_rate_schedule': learning_rate_schedule,
            'weight_decay_schedule': weight_decay_schedule,
        }
        momentum_dtype = jnp.bfloat16 if config.bf16_momentum else jnp.float32
        clip = optax.clip_by_global_norm(config.clip_gradient)
        adafactor = optax.adafactor(
            learning_rate=learning_rate_schedule,
            multiply_by_parameter_scale=True,
            momentum=config.b1,
            decay_rate=config.b2,
            factored=False,
            clipping_threshold=None,
            dtype_momentum=momentum_dtype,
        )
        scheduled_decay = optax_add_scheduled_weight_decay(
            weight_decay_schedule, weight_decay_mask
        )
        optimizer = optax.chain(clip, adafactor, scheduled_decay)
        return optimizer, optimizer_info
 class AdamWOptimizerFactory(object):
    """ Factory for AdamW with a warmup + cosine decay learning-rate schedule. """
    def __init__(self):
        raise NotImplementedError
    @staticmethod
    def get_default_config(updates=None):
        """ Return the default AdamW config, optionally overridden by `updates`. """
        defaults = ConfigDict()
        defaults.init_lr = 0.0
        defaults.end_lr = 0.001
        defaults.lr = 0.01
        defaults.lr_warmup_steps = 2000
        defaults.lr_decay_steps = 500000
        defaults.b1 = 0.9
        defaults.b2 = 0.95
        defaults.clip_gradient = 1.0
        defaults.weight_decay = 1e-4
        defaults.bf16_momentum = False
        defaults.multiply_by_parameter_scale = False
        if updates is None:
            return defaults
        defaults.update(ConfigDict(updates).copy_and_resolve_references())
        return defaults
    @classmethod
    def get_optimizer(cls, config, weight_decay_mask=None):
        """ Build the optimizer.

        With `multiply_by_parameter_scale` unset, gradient clipping is chained
        with `optax.adamw`. With it set, clipping is chained with unfactored
        `optax.adafactor` (parameter scaling enabled) followed by a weight
        decay that follows the learning-rate schedule.

        Args:
            config: overrides applied on top of `get_default_config()`.
            weight_decay_mask: mask applied to the weight decay.

        Returns:
            A tuple `(optimizer, optimizer_info)`; `optimizer_info` holds the
            `learning_rate_schedule` callable.
        """
        config = cls.get_default_config(config)
        learning_rate_schedule = optax.warmup_cosine_decay_schedule(
            init_value=config.init_lr,
            peak_value=config.lr,
            warmup_steps=config.lr_warmup_steps,
            decay_steps=config.lr_decay_steps,
            end_value=config.end_lr,
        )
        optimizer_info = {'learning_rate_schedule': learning_rate_schedule}
        momentum_dtype = jnp.bfloat16 if config.bf16_momentum else jnp.float32
        clip = optax.clip_by_global_norm(config.clip_gradient)
        if not config.multiply_by_parameter_scale:
            adamw = optax.adamw(
                learning_rate=learning_rate_schedule,
                weight_decay=config.weight_decay,
                b1=config.b1,
                b2=config.b2,
                mask=weight_decay_mask,
                mu_dtype=momentum_dtype,
            )
            return optax.chain(clip, adamw), optimizer_info
        def weight_decay_schedule(step):
            # Negative coefficient: learning rate times weight decay.
            return -learning_rate_schedule(step) * config.weight_decay
        adafactor = optax.adafactor(
            learning_rate=learning_rate_schedule,
            multiply_by_parameter_scale=True,
            momentum=config.b1,
            decay_rate=config.b2,
            factored=False,
            clipping_threshold=None,
            dtype_momentum=momentum_dtype,
        )
        scheduled_decay = optax_add_scheduled_weight_decay(
            weight_decay_schedule, weight_decay_mask
        )
        return optax.chain(clip, adafactor, scheduled_decay), optimizer_info
 class LionOptimizerFactory(object):
    """ Factory for the Lion optimizer with a selectable learning-rate schedule. """
    def __init__(self):
        raise NotImplementedError
    @staticmethod
    def get_default_config(updates=None):
        """ Return the default Lion config, optionally overridden by `updates`. """
        defaults = ConfigDict()
        defaults.init_lr = 0.0
        defaults.end_lr = 0.0001
        defaults.lr = 0.001
        defaults.lr_warmup_steps = 60000
        defaults.lr_constant_steps = 840000
        defaults.lr_decay_steps = 100000
        defaults.b1 = 0.9
        defaults.b2 = 0.98
        defaults.clip_gradient = 1.0
        defaults.weight_decay = 1e-3
        defaults.bf16_momentum = False
        defaults.lr_schedule_type = "warmup_cosine_decay_schedule"
        defaults.lr_decay_rate = 0.98
        if updates is None:
            return defaults
        defaults.update(ConfigDict(updates).copy_and_resolve_references())
        return defaults
    @classmethod
    def get_optimizer(cls, config, weight_decay_mask=None):
        """ Build the Lion optimizer with the configured schedule.

        `config.lr_schedule_type` selects one of
        "warmup_cosine_decay_schedule", "warmup_constant",
        "warmup_constant_linear_decay", "warmup_constant_exponential_decay",
        "exponential_decay" or "linear_decay".

        Args:
            config: overrides applied on top of `get_default_config()`.
            weight_decay_mask: mask passed to `optax.lion`.

        Returns:
            A tuple `(optimizer, optimizer_info)`; `optimizer_info` holds the
            `learning_rate_schedule` callable.

        Raises:
            ValueError: if `config.lr_schedule_type` is not one of the above.
        """
        config = cls.get_default_config(config)
        schedule_type = config.lr_schedule_type
        # Builders for the pieces that several schedule types share.
        def linear_warmup():
            return optax.linear_schedule(
                init_value=config.init_lr,
                end_value=config.lr,
                transition_steps=config.lr_warmup_steps,
            )
        def constant_phase():
            return optax.constant_schedule(config.lr)
        def linear_decay():
            return optax.linear_schedule(
                init_value=config.lr,
                end_value=config.end_lr,
                transition_steps=config.lr_decay_steps,
            )
        def exponential_decay():
            return optax.exponential_decay(
                init_value=config.lr,
                transition_steps=config.lr_decay_steps,
                decay_rate=config.lr_decay_rate,
                transition_begin=0,
                staircase=False,
                end_value=config.end_lr,
            )
        # NOTE(review): optax.join_schedules takes boundaries as absolute
        # step counts, so in the three-phase schedules the constant phase
        # ends at step lr_constant_steps rather than lasting that many
        # steps. Confirm this is the intended meaning of lr_constant_steps.
        if schedule_type == "warmup_cosine_decay_schedule":
            learning_rate_schedule = optax.warmup_cosine_decay_schedule(
                init_value=config.init_lr,
                peak_value=config.lr,
                warmup_steps=config.lr_warmup_steps,
                decay_steps=config.lr_decay_steps,
                end_value=config.end_lr,
            )
        elif schedule_type == "warmup_constant":
            learning_rate_schedule = optax.join_schedules(
                [linear_warmup(), constant_phase()],
                [config.lr_warmup_steps],
            )
        elif schedule_type == "warmup_constant_linear_decay":
            learning_rate_schedule = optax.join_schedules(
                [linear_warmup(), constant_phase(), linear_decay()],
                [config.lr_warmup_steps, config.lr_constant_steps],
            )
        elif schedule_type == "warmup_constant_exponential_decay":
            learning_rate_schedule = optax.join_schedules(
                [linear_warmup(), constant_phase(), exponential_decay()],
                [config.lr_warmup_steps, config.lr_constant_steps],
            )
        elif schedule_type == "exponential_decay":
            learning_rate_schedule = exponential_decay()
        elif schedule_type == "linear_decay":
            learning_rate_schedule = linear_decay()
        else:
            raise ValueError('config.lr_schedule_type must be "warmup_cosine_decay_schedule", "warmup_constant", "warmup_constant_linear_decay", "warmup_constant_exponential_decay", "exponential_decay" or "linear_decay"')
        optimizer_info = {'learning_rate_schedule': learning_rate_schedule}
        momentum_dtype = jnp.bfloat16 if config.bf16_momentum else jnp.float32
        lion = optax.lion(
            learning_rate=learning_rate_schedule,
            weight_decay=config.weight_decay,
            b1=config.b1,
            b2=config.b2,
            mask=weight_decay_mask,
            mu_dtype=momentum_dtype,
        )
        optimizer = optax.chain(
            optax.clip_by_global_norm(config.clip_gradient),
            lion,
        )
        return optimizer, optimizer_info
 class OptaxScheduledWeightDecayState(NamedTuple):
    """ State carried by the scheduled weight decay transformation. """
    # Number of updates applied so far; this value is fed to the schedule.
    count: jax.Array
 def optax_add_scheduled_weight_decay(schedule_fn, mask=None):
    """ Build a gradient transformation that adds scheduled weight decay.

    On every update, `schedule_fn(count) * param` is added to each update
    leaf, where `count` is the number of updates applied so far.

    Args:
        schedule_fn: maps the update count to the decay coefficient.
        mask: optional mask; when given, the transformation is wrapped in
            `optax.masked`.

    Returns:
        An `optax.GradientTransformation`.
    """
    def init_fn(params):
        del params
        return OptaxScheduledWeightDecayState(count=jnp.zeros([], jnp.int32))
    def update_fn(updates, state, params):
        if params is None:
            raise ValueError('Params cannot be None for weight decay!')
        coefficient = schedule_fn(state.count)
        def add_decay(update, param):
            return update + coefficient * param
        decayed_updates = jax.tree_util.tree_map(add_decay, updates, params)
        next_state = OptaxScheduledWeightDecayState(
            count=optax.safe_int32_increment(state.count)
        )
        return decayed_updates, next_state
    if mask is None:
        return optax.GradientTransformation(init_fn, update_fn)
    return optax.masked(optax.GradientTransformation(init_fn, update_fn), mask)
--- a/EasyLM/scripts/init.py
+++ b/EasyLM/scripts/init.py
--- a/EasyLM/scripts/benchmark_attention.py
+++ b/EasyLM/scripts/benchmark_attention.py
@@ -0,0 +1,150 @@
 from functools import partial
 from time import time
 import os
 import numpy as np
 import jax
 import jax.flatten_util
 import jax.numpy as jnp
 import mlxu
 from EasyLM.bpt import blockwise_attn
 from EasyLM.jax_utils import (
    get_float_dtype_by_name, set_random_seed, next_rng, JaxRNG
 )
 # Flags of the attention benchmark together with their default values.
 FLAGS, _ = mlxu.define_flags_with_default(
    seed=42,  # passed to set_random_seed
    dtype='fp32',  # dtype name resolved with get_float_dtype_by_name
    embed_dim=2048,  # per-head size is embed_dim // n_heads
    n_heads=16,
    ref_attn_seq_len=2048,  # sequence length for the reference attention
    eff_attn_seq_len=16384,  # sequence length for the blockwise attention
    batch_size=1,
    query_chunk_size=2048,  # forwarded to blockwise_attn
    key_chunk_size=2048,  # forwarded to blockwise_attn
    warmup_steps=40,  # untimed iterations run before each measurement
    steps=200,  # timed iterations per implementation
 )
 def main(argv):
    """ Time a reference causal attention against blockwise attention.

    Each implementation is run as a jitted forward + backward pass on random
    inputs: once untimed, then `FLAGS.warmup_steps` untimed iterations, then
    `FLAGS.steps` timed iterations. The elapsed times and an efficiency
    figure are printed.
    """
    def random_kqv(rng_key, seq_len):
        """ Draw three normal tensors of shape
            (batch_size, seq_len, n_heads, embed_dim // n_heads).
        """
        rng_generator = JaxRNG(rng_key)
        shape = (
            FLAGS.batch_size, seq_len, FLAGS.n_heads,
            FLAGS.embed_dim // FLAGS.n_heads,
        )
        return tuple(
            jax.random.normal(
                rng_generator(), shape,
                dtype=get_float_dtype_by_name(FLAGS.dtype)
            )
            for _ in range(3)
        )
    def reference_attn(query, key, value):
        """ Causal softmax attention that builds the full logits tensor. """
        dtype = get_float_dtype_by_name(FLAGS.dtype)
        scaled_query = query / jnp.sqrt(query.shape[-1]).astype(dtype)
        logits = jnp.einsum("bqhc,bkhc->bhqk", scaled_query, key)
        mask_shape = (scaled_query.shape[1], key.shape[1])
        row_ids = jax.lax.broadcasted_iota(jnp.int32, mask_shape, 0)
        col_ids = jax.lax.broadcasted_iota(jnp.int32, mask_shape, 1)
        # True where the key index lies after the query index.
        future_positions = (row_ids < col_ids)[None, None, :, :]
        bias = jnp.where(future_positions, jnp.finfo(logits.dtype).min, 0.0)
        attn_weights = jax.nn.softmax(logits + bias, axis=-1)
        return jnp.einsum("bhqk,bkhc->bqhc", attn_weights, value)
    def efficient_attention(query, key, value):
        """ Causal attention computed by `blockwise_attn` without dropout. """
        return blockwise_attn(
            query, key, value,
            bias=None,
            deterministic=True,
            dropout_rng=None,
            attn_pdrop=0.0,
            causal=True,
            query_chunk_size=FLAGS.query_chunk_size,
            key_chunk_size=FLAGS.key_chunk_size,
            dtype=get_float_dtype_by_name(FLAGS.dtype),
            policy=jax.checkpoint_policies.nothing_saveable(),
            precision=None,
            float32_logits=True,
            prevent_cse=True,
        )
    @partial(jax.jit, static_argnums=(1,))
    def reference_attn_forward_backward(rng_key, seq_len):
        def mean_output(query, key, value):
            return jnp.mean(reference_attn(query, key, value))
        # NOTE(review): nothing_saveable is called here instead of being
        # passed as a callable; confirm this is intended.
        grad_fn = jax.grad(
            jax.checkpoint(
                mean_output,
                policy=jax.checkpoint_policies.nothing_saveable(),
            ),
            argnums=(0, 1, 2),
        )
        query, key, value = random_kqv(rng_key, seq_len)
        # Only the gradient w.r.t. key (index 1) is reduced to a scalar.
        key_grad = grad_fn(query, key, value)[1]
        return jax.flatten_util.ravel_pytree(key_grad)[0].mean()
    @partial(jax.jit, static_argnums=(1,))
    def efficient_attn_forward_backward(rng_key, seq_len):
        def mean_output(query, key, value):
            return jnp.mean(efficient_attention(query, key, value))
        grad_fn = jax.grad(mean_output, argnums=(0, 1, 2))
        query, key, value = random_kqv(rng_key, seq_len)
        # Only the gradient w.r.t. key (index 1) is reduced to a scalar.
        key_grad = grad_fn(query, key, value)[1]
        return jax.flatten_util.ravel_pytree(key_grad)[0].mean()
    def timed_run(step_fn, seq_len):
        """ Run the warmup iterations, then return the wall-clock seconds
            taken by FLAGS.steps iterations of `step_fn`.
        """
        results = [
            step_fn(next_rng(), seq_len) for _ in range(FLAGS.warmup_steps)
        ]
        jax.block_until_ready(results)
        start_time = time()
        results = [step_fn(next_rng(), seq_len) for _ in range(FLAGS.steps)]
        jax.block_until_ready(results)
        return time() - start_time
    set_random_seed(FLAGS.seed)
    # One untimed call of each jitted function before any measurement.
    jax.block_until_ready(reference_attn_forward_backward(next_rng(), FLAGS.ref_attn_seq_len))
    jax.block_until_ready(efficient_attn_forward_backward(next_rng(), FLAGS.eff_attn_seq_len))
    elapsed_time_ref_attn = timed_run(
        reference_attn_forward_backward, FLAGS.ref_attn_seq_len
    )
    print(f'Reference attention: {elapsed_time_ref_attn:.3f} seconds')
    elapsed_time_efficient_attn = timed_run(
        efficient_attn_forward_backward, FLAGS.eff_attn_seq_len
    )
    print(f'Efficient attention: {elapsed_time_efficient_attn:.3f} seconds')
    # The time ratio is scaled by the squared ratio of the sequence lengths.
    flops_ratio = (FLAGS.eff_attn_seq_len / FLAGS.ref_attn_seq_len) ** 2
    efficiency = elapsed_time_ref_attn / elapsed_time_efficient_attn * flops_ratio
    print(f'Efficiency: {efficiency:.3f}')
 # Start main through mlxu.run only when the file is executed as a script.
 if __name__ == '__main__':
    mlxu.run(main)
--- a/EasyLM/scripts/convert_checkpoint.py
+++ b/EasyLM/scripts/convert_checkpoint.py
@@ -0,0 +1,42 @@
 # This script converts a model checkpoint trained by EasyLM to a standard
 # msgpack checkpoint that can be loaded by huggingface transformers or
 # flax.serialization.msgpack_restore. Such conversion allows models to be
 # used by other frameworks that integrate with huggingface transformers.
 import pprint
 from functools import partial
 import os
 import numpy as np
 import mlxu
 import jax.numpy as jnp
 import flax.serialization
 from EasyLM.checkpoint import StreamingCheckpointer
 from EasyLM.jax_utils import float_to_dtype
 # Flags of the checkpoint conversion script together with their defaults.
 FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
    load_checkpoint='',  # checkpoint to convert; must be non-empty
    output_file='',  # destination of the converted checkpoint; must be non-empty
    streaming=False,  # True: save with StreamingCheckpointer; False: msgpack
    float_dtype='bf16',  # float dtype the parameters are converted to
 )
 def main(argv):
    """ Convert the parameters of an EasyLM checkpoint.

    The 'params' entry of the checkpoint named by `FLAGS.load_checkpoint` is
    written to `FLAGS.output_file`, either through
    `StreamingCheckpointer.save_train_state_to_file` (when `FLAGS.streaming`
    is set) or as a single msgpack blob after conversion to
    `FLAGS.float_dtype`.

    Raises:
        AssertionError: if the input checkpoint or the output file is unset.
    """
    assert FLAGS.load_checkpoint != '' and FLAGS.output_file != '', 'input and output must be specified'
    params = StreamingCheckpointer.load_trainstate_checkpoint(
        FLAGS.load_checkpoint, disallow_trainstate=True
    )[1]['params']
    if FLAGS.streaming:
        StreamingCheckpointer.save_train_state_to_file(
            params, FLAGS.output_file, float_dtype=FLAGS.float_dtype
        )
    else:
        params = float_to_dtype(params, FLAGS.float_dtype)
        # The defined flag is `output_file`; `FLAGS.output` does not exist,
        # so this branch used to fail before anything was written.
        with mlxu.open_file(FLAGS.output_file, 'wb') as fout:
            fout.write(flax.serialization.msgpack_serialize(params, in_place=True))
 # Start main through mlxu.run only when the file is executed as a script.
 if __name__ == "__main__":
    mlxu.run(main)
--- a/EasyLM/scripts/diff_checkpoint.py
+++ b/EasyLM/scripts/diff_checkpoint.py
@@ -0,0 +1,59 @@
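 # This script computes the parameter-wise difference between a target
 # checkpoint and a base checkpoint trained by EasyLM (target - base). With
 # recover_diff set, it instead adds the two (base + target), which restores
 # the original parameters when the target checkpoint holds such a diff.
 # The result is written either as a streaming checkpoint or as a standard
 # msgpack file that can be loaded by flax.serialization.msgpack_restore.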
 import pprint
 from functools import partial
 import os
 import numpy as np
 import jax
 import jax.numpy as jnp
 import flax.serialization
 import mlxu
 from EasyLM.checkpoint import StreamingCheckpointer
 from EasyLM.jax_utils import float_to_dtype
 # Flags of the checkpoint diff script together with their defaults.
 FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
    recover_diff=False,  # True: output base + target; False: output target - base
    load_base_checkpoint='',  # base checkpoint; must be non-empty
    load_target_checkpoint='',  # target (or diff) checkpoint; must be non-empty
    output_file='',  # destination of the result; must be non-empty
    streaming=True,  # True: save with StreamingCheckpointer; False: msgpack
    float_dtype='bf16',  # float dtype the result is converted to
 )
 def main(argv):
    """ Combine the parameters of a base and a target checkpoint.

    Without `FLAGS.recover_diff` the result is `target - base` per parameter;
    with it the result is `base + target`. The result is written to
    `FLAGS.output_file`, either through
    `StreamingCheckpointer.save_train_state_to_file` (when `FLAGS.streaming`
    is set) or as a single msgpack blob after conversion to
    `FLAGS.float_dtype`.

    Raises:
        AssertionError: if a checkpoint path or the output file is unset.
    """
    assert FLAGS.load_base_checkpoint != '' and FLAGS.load_target_checkpoint != ''
    assert FLAGS.output_file != ''
    base_params = StreamingCheckpointer.load_trainstate_checkpoint(
        FLAGS.load_base_checkpoint, disallow_trainstate=True
    )[1]['params']
    target_params = StreamingCheckpointer.load_trainstate_checkpoint(
        FLAGS.load_target_checkpoint, disallow_trainstate=True
    )[1]['params']
    if FLAGS.recover_diff:
        params = jax.tree_util.tree_map(
            lambda b, t: b + t, base_params, target_params
        )
    else:
        params = jax.tree_util.tree_map(
            lambda b, t: t - b, base_params, target_params
        )
    if FLAGS.streaming:
        StreamingCheckpointer.save_train_state_to_file(
            params, FLAGS.output_file, float_dtype=FLAGS.float_dtype
        )
    else:
        params = float_to_dtype(params, FLAGS.float_dtype)
        # The defined flag is `output_file`; `FLAGS.output` does not exist,
        # so this branch used to fail before anything was written.
        with mlxu.open_file(FLAGS.output_file, 'wb') as fout:
            fout.write(flax.serialization.msgpack_serialize(params, in_place=True))
 # Start main through mlxu.run only when the file is executed as a script.
 if __name__ == "__main__":
    mlxu.run(main)
--- a/EasyLM/scripts/lm_eval_harness.py
+++ b/EasyLM/scripts/lm_eval_harness.py
@@ -0,0 +1,65 @@
 # This script runs lm_eval_harness evaluations against a served language model.
 # Typically, you need to run a language model server first, e.g.:
 #    python -m EasyLM.models.gptj.gptj_serve ...
 import dataclasses
 import pprint
 from functools import partial
 import os
 from tqdm import tqdm, trange
 import numpy as np
 import mlxu
 from flax.traverse_util import flatten_dict
 from lm_eval import evaluator, tasks
 from lm_eval.base import LM
 from EasyLM.serving import LMClient
 # Flags of the lm_eval harness runner together with their defaults.
 FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
    tasks='wsc,piqa,winogrande,openbookqa,logiqa',  # comma-separated task names
    shots=0,  # passed to evaluator.evaluate; presumably the few-shot count
    limit=0,  # values <= 0 are passed on as limit=None
    write_out=False,  # forwarded to evaluator.evaluate
    lm_client=LMClient.get_default_config(),  # config of the LMClient
    logger=mlxu.WandBLogger.get_default_config(),  # config of the WandB logger
 )
 class LMEvalHarnessInterface(LM):
    """ Adapter that answers lm_eval `LM` requests through an `LMClient`. """
    def __init__(self, lm_client):
        self.lm_client = lm_client
    def greedy_until(self, inputs):
        """ Forward (prefix, until) pairs to the client's `greedy_until`. """
        prefixes, stop_sequences = zip(*inputs)
        return self.lm_client.greedy_until(prefixes, stop_sequences)
    def loglikelihood_rolling(self, inputs):
        """ Return a list of (loglikelihood, is_greedy) pairs for `inputs`. """
        scores, greedy_flags = self.lm_client.loglikelihood_rolling(inputs)
        return list(zip(scores, greedy_flags))
    def loglikelihood(self, inputs):
        """ Return a list of (loglikelihood, is_greedy) pairs for the given
            (prefix, text) pairs.
        """
        prefixes, texts = zip(*inputs)
        scores, greedy_flags = self.lm_client.loglikelihood(prefixes, texts)
        return list(zip(scores, greedy_flags))
 def main(argv):
    """ Evaluate the served model on the configured lm_eval tasks.

    The 'results' entry of the evaluation output is flattened with '/' as
    separator and logged; the full output is pretty-printed.
    """
    user_flags = mlxu.get_user_flags(FLAGS, FLAGS_DEF)
    logger = mlxu.WandBLogger(config=FLAGS.logger, variant=user_flags)
    harness_model = LMEvalHarnessInterface(LMClient(FLAGS.lm_client))
    task_names = FLAGS.tasks.split(',')
    # A non-positive limit means the limit is disabled.
    if FLAGS.limit <= 0:
        limit = None
    else:
        limit = FLAGS.limit
    results = evaluator.evaluate(
        harness_model, tasks.get_task_dict(task_names), False, FLAGS.shots,
        limit=limit,
        write_out=FLAGS.write_out,
    )
    flat_results = flatten_dict(results['results'], sep='/')
    logger.log(flat_results)
    pprint.pprint(results)
 # Start main through mlxu.run only when the file is executed as a script.
 if __name__ == "__main__":
    mlxu.run(main)
--- a/EasyLM/scripts/lm_eval_json.py
+++ b/EasyLM/scripts/lm_eval_json.py
@@ -0,0 +1,52 @@
 import json
 import mlxu
 from EasyLM.serving import LMClient
 # Flags of the JSON evaluation script together with their defaults.
 FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
    input_file='',  # JSON file holding the evaluation inputs
    output_file='',  # JSON file the results are written to
    prefix_field='prefix',  # key of the prefixes in the input JSON
    text_field='text',  # key of the texts in the input JSON
    until_field='until',  # key of the stop sequences in the input JSON
    # One of 'loglikelihood', 'loglikelihood_rolling', 'greedy_until'
    # or 'generate'.
    eval_type='loglikelihood',
    lm_client=LMClient.get_default_config(),  # config of the LMClient
 )
 def main(argv):
    """ Run one evaluation request described by a JSON file.

    The inputs are read from `FLAGS.input_file`, sent to the language model
    client according to `FLAGS.eval_type`, and the answer is written as JSON
    to `FLAGS.output_file`.

    Raises:
        ValueError: if `FLAGS.eval_type` is not a supported type.
    """
    lm_client = LMClient(FLAGS.lm_client)
    with mlxu.open_file(FLAGS.input_file, 'r') as fin:
        input_data = json.load(fin)
    def as_score_output(scores):
        # Both loglikelihood calls answer with a (values, greedy flags) pair.
        loglikelihoods, is_greedys = scores
        return {
            'loglikelihood': loglikelihoods,
            'is_greedy': is_greedys,
        }
    eval_type = FLAGS.eval_type
    if eval_type == 'loglikelihood':
        prefix = input_data[FLAGS.prefix_field]
        text = input_data[FLAGS.text_field]
        output_data = as_score_output(lm_client.loglikelihood(prefix, text))
    elif eval_type == 'loglikelihood_rolling':
        text = input_data[FLAGS.text_field]
        output_data = as_score_output(lm_client.loglikelihood_rolling(text))
    elif eval_type == 'greedy_until':
        prefix = input_data[FLAGS.prefix_field]
        until = input_data[FLAGS.until_field]
        output_data = {'output_text': lm_client.greedy_until(prefix, until)}
    elif eval_type == 'generate':
        prefix = input_data[FLAGS.prefix_field]
        output_data = {'output_text': lm_client.generate(prefix)}
    else:
        raise ValueError(f'Unknown eval_type: {FLAGS.eval_type}')
    with mlxu.open_file(FLAGS.output_file, 'w') as fout:
        json.dump(output_data, fout)
 # Start main through mlxu.run only when the file is executed as a script.
 if __name__ == "__main__":
    mlxu.run(main)
--- a/EasyLM/serving.py
+++ b/EasyLM/serving.py
@@ -0,0 +1,566 @@
 import dataclasses
 import pprint
 from functools import partial
 import re
 import os
 from threading import Lock
 import urllib
 import time
 from typing import List, Optional, Union
 from pydantic import BaseModel
 import absl.logging
 from tqdm import tqdm, trange
 import numpy as np
 import mlxu
 from ml_collections import ConfigDict
 import uvicorn
 from fastapi import FastAPI
 import gradio as gr
 import requests
 from requests.exceptions import Timeout, ConnectionError
 # Request body shared by the /loglikelihood, /loglikelihood-rolling,
 # /generate and /greedy-until endpoints of LMServer. Every field is optional
 # at the schema level; each endpoint reads only the fields it needs.
 class InferenceRequest(BaseModel):
    # Conditioning prefixes. /loglikelihood substitutes empty strings when
    # this is None; /generate and /greedy-until iterate it directly.
    prefix_text: Optional[List[str]] = None
    # Texts to score (used by /loglikelihood and /loglikelihood-rolling).
    text: Optional[List[str]] = None
    # Stop sequences for /greedy-until: one string or one list of strings
    # per prefix.
    until: Optional[Union[List[str], List[List[str]]]] = None
    # Sampling temperature for /generate; the server's default_temperature
    # is used when None.
    temperature: Optional[float] = None
 # Request body of the /chat endpoint of LMServer.
 class ChatRequest(BaseModel):
    # The new user message.
    prompt: str
    # Conversation so far, as returned in the 'context' field of the
    # previous /chat response; empty for a new conversation.
    context: str = ''
    # Sampling temperature; the server's default_temperature is used when None.
    temperature: Optional[float] = None
 class LMServer(object):
    """ HTTP server for serving language models. """
    @staticmethod
    def get_default_config(updates=None):
        """Build the server configuration, optionally overridden by `updates`.

        Args:
            updates: optional mapping/ConfigDict of overrides applied on top
                of the defaults below.

        Returns:
            A ConfigDict holding the network settings, batching settings,
            text wrappers applied to prefixes/texts, and the chat template
            pieces.
        """
        defaults = (
            ('host', '0.0.0.0'),
            ('port', 5007),
            ('batch_size', 1),
            ('logging', False),
            ('pre_compile', 'loglikelihood'),
            ('default_temperature', 1.0),
            ('greedy_until_max_length', 5000),
            ('prepend_to_prefix', ''),
            ('append_to_prefix', ''),
            ('prepend_to_text', ''),
            ('append_to_text', ''),
            ('chat_prepend_text', ''),
            ('chat_user_prefix', ''),
            ('chat_user_suffix', ''),
            ('chat_lm_prefix', ''),
            ('chat_lm_suffix', ''),
            ('notes', ''),
        )
        config = ConfigDict()
        for field_name, field_value in defaults:
            setattr(config, field_name, field_value)

        if updates is None:
            return config
        overrides = ConfigDict(updates).copy_and_resolve_references()
        config.update(overrides)
        return config
    def __init__(self, config):
        """Resolve the config, create the lock, and register all endpoints.

        Args:
            config: overrides passed to `get_default_config`.
        """
        self.config = self.get_default_config(config)
        # Held by the serve_* inference handlers while they use the model.
        self.lock = Lock()
        self.app = FastAPI()

        post_routes = (
            ('/loglikelihood', self.serve_loglikelihood),
            ('/loglikelihood-rolling', self.serve_loglikelihood_rolling),
            ('/generate', self.serve_generate),
            ('/greedy-until', self.serve_greedy_until),
            ('/chat', self.serve_chat),
        )
        for route, handler in post_routes:
            self.app.post(route)(handler)
        self.app.get('/ready')(self.serve_ready)

        # The Gradio chat UI is served from the site root.
        chat_ui = self.create_chat_app()
        self.app = gr.mount_gradio_app(self.app, chat_ui, '/')
    @staticmethod
    def loglikelihood(prefix_text, text):
        """Model hook for scoring `text` given `prefix_text`; must be overridden.

        `serve_loglikelihood` calls this with two lists of strings padded to
        `config.batch_size` and unpacks the result as
        `(log_likelihoods, is_greedy)`, one entry per input.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()
    @staticmethod
    def loglikelihood_rolling(text):
        """Model hook for scoring whole texts without a prefix; must be overridden.

        `serve_loglikelihood_rolling` calls this with a list of strings padded
        to `config.batch_size` and unpacks the result as
        `(log_likelihoods, is_greedy)`, one entry per input.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()
    @staticmethod
    def generate(text, temperature):
        """Model hook for sampling continuations; must be overridden.

        Called by `serve_generate` (with a batch padded to
        `config.batch_size`) and by `process_chat` (with a single-element
        list). The result is indexed per input, one output string each.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()
    @staticmethod
    def greedy_until(prefix_text, until, max_length):
        """Model hook for greedy decoding up to a stop sequence; must be overridden.

        `serve_greedy_until` calls this with a batch of prefixes, the matching
        slice of `until`, and `config.greedy_until_max_length`; unlike the
        other hooks the batch is NOT padded to `config.batch_size`.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()
    @staticmethod
    def to_list(x):
        if isinstance(x, np.ndarray):
            return x.tolist()
        return x
    def serve_ready(self):
        """Handle GET /ready: return a fixed string so clients can poll for liveness."""
        return 'Ready!\n'
    def serve_loglikelihood(self, data: InferenceRequest):
        """Handle POST /loglikelihood: score each text given its prefix.

        The configured prepend/append strings are wrapped around every prefix
        and text, and the examples are fed to `self.loglikelihood` in chunks
        of `config.batch_size`. A short final chunk is padded with the dummy
        example 'a'; results for the padding are discarded.

        Args:
            data: request whose `text` is scored; a missing `prefix_text` is
                replaced (in place) by one empty prefix per text.

        Returns:
            Dict with the request's `prefix_text` and `text` plus the
            per-example `log_likelihood` and `is_greedy` lists.
        """
        cfg = self.config
        with self.lock:
            if cfg.logging:
                absl.logging.info(
                    '\n========= Serving Log Likelihood Request ========= \n'
                    + pprint.pformat(data) + '\n'
                )

            if data.prefix_text is None:
                data.prefix_text = [''] * len(data.text)

            prefixes = [
                cfg.prepend_to_prefix + item + cfg.append_to_prefix
                for item in data.prefix_text
            ]
            texts = [
                cfg.prepend_to_text + item + cfg.append_to_text
                for item in data.text
            ]

            step = cfg.batch_size
            scores = []
            greedy_flags = []
            for start in trange(0, len(texts), step, ncols=0):
                stop = start + step
                prefix_chunk = prefixes[start:stop]
                text_chunk = texts[start:stop]
                real_count = len(text_chunk)
                # Pad the last chunk so the model always sees `step` examples.
                for _ in range(step - real_count):
                    prefix_chunk.append('a')
                    text_chunk.append('a')

                chunk_scores, chunk_greedy = self.loglikelihood(
                    prefix_chunk, text_chunk
                )
                scores += self.to_list(chunk_scores)[:real_count]
                greedy_flags += self.to_list(chunk_greedy)[:real_count]

            output = {
                'prefix_text': data.prefix_text,
                'text': data.text,
                'log_likelihood': scores,
                'is_greedy': greedy_flags,
            }
            if cfg.logging:
                absl.logging.info(
                    '\n========= Output ========= \n'
                    + pprint.pformat(output) + '\n'
                )
        return output
    def serve_loglikelihood_rolling(self, data: InferenceRequest):
        """Handle POST /loglikelihood-rolling: score each text with no prefix.

        Texts are wrapped with the configured prepend/append strings and fed
        to `self.loglikelihood_rolling` in chunks of `config.batch_size`. A
        short final chunk is padded with the dummy example 'a'; results for
        the padding are discarded.

        Args:
            data: request whose `text` list is scored.

        Returns:
            Dict with the request's `text` plus the per-example
            `log_likelihood` and `is_greedy` lists.
        """
        cfg = self.config
        with self.lock:
            if cfg.logging:
                absl.logging.info(
                    '\n========= Serving Log Likelihood Request ========= \n'
                    + pprint.pformat(data) + '\n'
                )

            texts = [
                cfg.prepend_to_text + item + cfg.append_to_text
                for item in data.text
            ]

            step = cfg.batch_size
            scores = []
            greedy_flags = []
            for start in trange(0, len(texts), step, ncols=0):
                text_chunk = texts[start:start + step]
                real_count = len(text_chunk)
                # Pad the last chunk so the model always sees `step` examples.
                for _ in range(step - real_count):
                    text_chunk.append('a')

                chunk_scores, chunk_greedy = self.loglikelihood_rolling(
                    text_chunk
                )
                scores += self.to_list(chunk_scores)[:real_count]
                greedy_flags += self.to_list(chunk_greedy)[:real_count]

            output = {
                'text': data.text,
                'log_likelihood': scores,
                'is_greedy': greedy_flags,
            }
            if cfg.logging:
                absl.logging.info(
                    '\n========= Output ========= \n'
                    + pprint.pformat(output) + '\n'
                )
        return output
    def serve_generate(self, data: InferenceRequest):
        """Handle POST /generate: sample one continuation per prefix.

        Prefixes are wrapped with the configured prepend/append strings and
        fed to `self.generate` in chunks of `config.batch_size`. A short
        final chunk is padded with the dummy prefix 'a'; outputs for the
        padding are discarded.

        Args:
            data: request whose `prefix_text` is continued. A missing
                `temperature` is replaced (in place) by
                `config.default_temperature`.

        Returns:
            Dict with the request's `prefix_text`, the generated
            `output_text` list, and the `temperature` that was used.
        """
        cfg = self.config
        with self.lock:
            if cfg.logging:
                absl.logging.info(
                    '\n========= Serving Generate Request ========= \n'
                    + pprint.pformat(data) + '\n'
                )

            prompts = [
                cfg.prepend_to_prefix + item + cfg.append_to_prefix
                for item in data.prefix_text
            ]
            if data.temperature is None:
                data.temperature = cfg.default_temperature

            step = cfg.batch_size
            completions = []
            for start in trange(0, len(prompts), step, ncols=0):
                prompt_chunk = prompts[start:start + step]
                real_count = len(prompt_chunk)
                # Pad the last chunk so the model always sees `step` prompts.
                for _ in range(step - real_count):
                    prompt_chunk.append('a')

                chunk_output = self.generate(
                    prompt_chunk,
                    temperature=data.temperature,
                )
                completions += self.to_list(chunk_output)[:real_count]

            output = {
                'prefix_text': data.prefix_text,
                'output_text': completions,
                'temperature': data.temperature,
            }
            if cfg.logging:
                absl.logging.info(
                    '\n========= Output ========= \n'
                    + pprint.pformat(output) + '\n'
                )
        return output
    def serve_greedy_until(self, data: InferenceRequest):
        """Handle POST /greedy-until: decode greedily up to a stop sequence.

        Prefixes are wrapped with the configured prepend/append strings and
        passed to `self.greedy_until` in chunks of `config.batch_size`,
        together with the matching slice of `data.until`. Unlike the other
        handlers, a short final chunk is not padded.

        Args:
            data: request providing `prefix_text` and `until`.

        Returns:
            Dict with the request's `prefix_text` and `until`, the
            `max_length` taken from the config, and the `output_text` list.
        """
        cfg = self.config
        with self.lock:
            if cfg.logging:
                absl.logging.info(
                    '\n========= Serving Greedy Until Request ========= \n'
                    + pprint.pformat(data) + '\n'
                )

            prompts = [
                cfg.prepend_to_prefix + item + cfg.append_to_prefix
                for item in data.prefix_text
            ]
            stops = data.until
            length_cap = cfg.greedy_until_max_length
            step = cfg.batch_size

            completions = []
            for start in range(0, len(prompts), step):
                stop = start + step
                prompt_chunk = prompts[start:stop]
                stop_chunk = stops[start:stop]
                chunk_output = self.greedy_until(
                    prompt_chunk, stop_chunk, length_cap
                )
                completions += self.to_list(chunk_output)[:len(prompt_chunk)]

            output = {
                'prefix_text': data.prefix_text,
                'until': data.until,
                'max_length': length_cap,
                'output_text': completions,
            }
            if cfg.logging:
                absl.logging.info(
                    '\n========= Output ========= \n'
                    + pprint.pformat(output) + '\n'
                )
        return output
    def process_chat(self, prompt, context, temperature):
        context = (
            context + self.config.chat_user_prefix
            + prompt + self.config.chat_user_suffix
            + self.config.chat_lm_prefix
        )
        response = self.generate(
            [self.config.chat_prepend_text + context],
            temperature=float(temperature),
        )[0]
        context = context + response + self.config.chat_lm_suffix
        return response, context
    def serve_chat(self, data: ChatRequest):
        """Handle POST /chat: run one chat turn via `process_chat`.

        Args:
            data: request with the user `prompt`, the running `context`, and
                an optional `temperature` (replaced in place by
                `config.default_temperature` when None).

        Returns:
            Dict with the model `response`, the updated `context` to send
            with the next turn, and the `temperature` that was used.
        """
        temperature = data.temperature
        if temperature is None:
            temperature = self.config.default_temperature
            data.temperature = temperature

        reply, updated_context = self.process_chat(
            data.prompt, data.context,
            temperature=temperature,
        )
        return {
            'response': reply,
            'context': updated_context,
            'temperature': temperature,
        }
    def create_chat_app(self):
        """Build the Gradio chat UI that `__init__` mounts at '/'.

        The UI has a chat history, a message box, Send / Regenerate / Reset
        buttons and a temperature slider. Every user action runs in two
        steps: a fast, unqueued step that locks the controls, then a queued
        `model_fn` step that calls `process_chat` and unlocks them.

        Conversation state lives in `context_state`, a two-element list:
        index 0 is the context the current turn is generated from, index 1
        is the context after the latest response. Keeping both lets
        Regenerate redo the last turn from the same starting context.

        Returns:
            The configured `gr.Blocks` application.
        """
        with gr.Blocks(analytics_enabled=False, title='EasyLM Chat') as gradio_chatbot:
            gr.Markdown('# Chatbot Powered by [EasyLM](https://github.com/young-geng/EasyLM)')
            gr.Markdown(self.config.notes)
            chatbot = gr.Chatbot(label='Chat history')
            msg = gr.Textbox(
                placeholder='Type your message here...',
                show_label=False
            )
            with gr.Row():
                send = gr.Button('Send')
                # Disabled until there is a response to regenerate.
                regenerate = gr.Button('Regenerate', interactive=False)
                clear = gr.Button('Reset')
            temp_slider = gr.Slider(
                label='Temperature', minimum=0, maximum=2.0,
                value=self.config.default_temperature
            )
            # [context to generate from, context after the latest response]
            context_state = gr.State(['', ''])
            def user_fn(user_message, history, context):
                # Step 1 of a new turn: lock the controls, append the user
                # message with an empty reply slot, and commit the previous
                # turn by promoting the latest context (index 1) to the
                # starting context (index 0).
                return {
                    msg: gr.update(value='', interactive=False),
                    clear: gr.update(interactive=False),
                    send: gr.update(interactive=False),
                    regenerate: gr.update(interactive=False),
                    chatbot: history + [[user_message, None]],
                    context_state: [context[1], context[1]],
                }
            def model_fn(history, context, temperature):
                # Step 2: answer the last user message starting from
                # context[0], fill the reply slot, store the new context at
                # index 1 (index 0 is kept for Regenerate), and unlock.
                history[-1][1], new_context = self.process_chat(
                    history[-1][0], context[0], temperature
                )
                return {
                    msg: gr.update(value='', interactive=True),
                    clear: gr.update(interactive=True),
                    send: gr.update(interactive=True),
                    chatbot: history,
                    context_state: [context[0], new_context],
                    regenerate: gr.update(interactive=True),
                }
            def regenerate_fn():
                # Step 1 of a regenerate: only lock the controls; history and
                # context are left untouched so model_fn overwrites the last
                # reply.
                return {
                    msg: gr.update(value='', interactive=False),
                    clear: gr.update(interactive=False),
                    send: gr.update(interactive=False),
                    regenerate: gr.update(interactive=False),
                }
            def clear_fn():
                # Reset the conversation: empty history, message box and
                # both contexts.
                return {
                    chatbot: None,
                    msg: '',
                    context_state: ['', ''],
                    regenerate: gr.update(interactive=False),
                }
            # Pressing Enter in the message box and clicking Send are wired
            # identically: user_fn immediately, then model_fn via the queue.
            msg.submit(
                user_fn,
                inputs=[msg, chatbot, context_state],
                outputs=[msg, clear, send, chatbot, context_state, regenerate],
                queue=False
            ).then(
                model_fn,
                inputs=[chatbot, context_state, temp_slider],
                outputs=[msg, clear, send, chatbot, context_state, regenerate],
                queue=True
            )
            send.click(
                user_fn,
                inputs=[msg, chatbot, context_state],
                outputs=[msg, clear, send, chatbot, context_state, regenerate],
                queue=False
            ).then(
                model_fn,
                inputs=[chatbot, context_state, temp_slider],
                outputs=[msg, clear, send, chatbot, context_state, regenerate],
                queue=True
            )
            regenerate.click(
                regenerate_fn,
                inputs=None,
                outputs=[msg, clear, send, regenerate],
                queue=False
            ).then(
                model_fn,
                inputs=[chatbot, context_state, temp_slider],
                outputs=[msg, clear, send, chatbot, context_state, regenerate],
                queue=True
            )
            clear.click(
                clear_fn,
                inputs=None,
                outputs=[chatbot, msg, context_state, regenerate],
                queue=False
            )
        # NOTE(review): `concurrency_count` is presumably specific to the
        # Gradio version this project pins -- confirm before upgrading Gradio.
        gradio_chatbot.queue(concurrency_count=1)
        return gradio_chatbot
    def run(self):
        """Optionally warm up the model hooks, then start the HTTP server.

        `config.pre_compile` is '' (no warm-up), 'all', or a comma-separated
        list of 'loglikelihood', 'generate', 'greedy_until' and 'chat'. Each
        listed task is invoked once on a dummy batch of `config.batch_size`
        copies of 'a' before uvicorn starts serving.

        Raises:
            ValueError: when an unknown warm-up task is listed; tasks listed
                before it have already been run.
        """
        requested = self.config.pre_compile
        if requested != '':
            if requested == 'all':
                tasks = ['loglikelihood', 'generate', 'greedy_until', 'chat']
            else:
                tasks = requested.split(',')
            dummy_batch = ['a' for _ in range(self.config.batch_size)]

            def warm_loglikelihood():
                self.loglikelihood(dummy_batch, dummy_batch)
                self.loglikelihood_rolling(dummy_batch)

            def warm_generate():
                self.generate(dummy_batch, 1.0)

            def warm_greedy_until():
                self.greedy_until(
                    dummy_batch, dummy_batch,
                    self.config.greedy_until_max_length
                )

            def warm_chat():
                self.process_chat('a', 'a', 1.0)

            warmers = {
                'loglikelihood': warm_loglikelihood,
                'generate': warm_generate,
                'greedy_until': warm_greedy_until,
                'chat': warm_chat,
            }
            for task in tasks:
                warmer = warmers.get(task)
                if warmer is None:
                    raise ValueError(f'Invalid precompile task: {task}!')
                warmer()

        uvicorn.run(self.app, host=self.config.host, port=self.config.port)
 class LMClient(object):
    """ A simple client for the LM server. """
    @staticmethod
    def get_default_config(updates=None):
        """Build the client configuration, optionally overridden by `updates`.

        Args:
            updates: optional mapping/ConfigDict of overrides applied on top
                of the defaults below.

        Returns:
            A ConfigDict with the server `url`, the request `batch_size`,
            whether to block until the server answers (`wait_for_ready`), and
            the `dummy` switch that makes every call return placeholder
            results without contacting a server.
        """
        defaults = (
            ('url', 'http://localhost:5007'),
            ('batch_size', 1),
            ('wait_for_ready', True),
            ('dummy', False),
        )
        config = ConfigDict()
        for field_name, field_value in defaults:
            setattr(config, field_name, field_value)

        if updates is None:
            return config
        overrides = ConfigDict(updates).copy_and_resolve_references()
        config.update(overrides)
        return config
    def __init__(self, config=None):
        """Resolve the config and, if it asks for it, block until the server is up.

        Args:
            config: optional overrides passed to `get_default_config`.
        """
        resolved = self.get_default_config(config)
        self.config = resolved
        if not resolved.wait_for_ready:
            return
        self.wait_for_ready()
    def wait_for_ready(self):
        if self.config.dummy:
            return
        while True:
            try:
                requests.get(urllib.parse.urljoin(self.config.url, 'ready'))
                return
            except (Timeout, ConnectionError) as e:
                time.sleep(10)
    @staticmethod
    def batched(iterator, batch_size):
        batch = []
        for example in iterator:
            batch.append(example)
            if len(batch) == batch_size:
                yield batch
                batch = []
        if len(batch) > 0:
            yield batch
    def loglikelihood(self, prefix, text):
        prefix, text = list(prefix), list(text)
        if self.config.dummy:
            return [-1.0 for _ in text], [False for _ in text]
        log_likelihood = []
        is_greedy = []
        batched_iterator = list(zip(
            self.batched(prefix, self.config.batch_size),
            self.batched(text, self.config.batch_size)
        ))
        for batch_prefix, batch_text in tqdm(batched_iterator, ncols=0):
            response = requests.post(
                urllib.parse.urljoin(self.config.url, 'loglikelihood'),
                json={'prefix_text': batch_prefix, 'text': batch_text}
            ).json()
            log_likelihood.extend(response['log_likelihood'])
            is_greedy.extend(response['is_greedy'])
        return log_likelihood, is_greedy
    def loglikelihood_rolling(self, text):
        text = list(text)
        if self.config.dummy:
            return [-1.0 for _ in text], [False for _ in text]
        log_likelihood = []
        is_greedy = []
        batched_iterator = list(self.batched(text, self.config.batch_size))
        for batch_text in tqdm(batched_iterator, ncols=0):
            response = requests.post(
                urllib.parse.urljoin(self.config.url, 'loglikelihood-rolling'),
                json={'text': batch_text}
            ).json()
            log_likelihood.extend(response['log_likelihood'])
            is_greedy.extend(response['is_greedy'])
        return log_likelihood, is_greedy
    def greedy_until(self, prefix, until):
        prefix, until = list(prefix), list(until)
        if self.config.dummy:
            results = []
            for u in until:
                if isinstance(u, str):
                    results.append('dummy text ' + u)
                else:
                    results.append('dummy text ' + u[0])
            return results
        batched_iterator = list(zip(
            self.batched(prefix, self.config.batch_size),
            self.batched(until, self.config.batch_size),
        ))
        output_text = []
        for batch_prefix, batch_until in tqdm(batched_iterator, ncols=0):
            response = requests.post(
                urllib.parse.urljoin(self.config.url, 'greedy-until'),
                json={'prefix_text': batch_prefix, 'until': batch_until}
            ).json()
            output_text.extend(response['output_text'])
        return output_text
    def generate(self, prefix, temperature=None):
        prefix = list(prefix)
        if self.config.dummy:
            return ['' for _ in prefix]
        output_text = []
        batched_iterator = list(self.batched(prefix, self.config.batch_size))
        for batch_prefix in tqdm(batched_iterator, ncols=0):
            response = requests.post(
                urllib.parse.urljoin(self.config.url, 'generate'),
                json={
                    'prefix_text': batch_prefix,
                    'temperature': temperature,
                }
            ).json()
            output_text.extend(response['output_text'])
        return output_text
    def chat(self, prompt, context, temperature=None):
        if self.config.dummy:
            return ''
        response = requests.post(
            urllib.parse.urljoin(self.config.url, 'chat'),
            json={
                'prompt': prompt,
                'context': context,
                'temperature': temperature,
            }
        ).json()
        return response['response'], response['context']
--- a/README.md
+++ b/README.md
@@ -0,0 +1,302 @@
 ---
 datasets:
 - Finnish-NLP/CulturaX_fi_cleaned
 - Finnish-NLP/HPLT_1.2_fi_cleaned
 - Finnish-NLP/wikipedia_20231101_fi_cleaned
 - Finnish-NLP/Reddit_fi_2006_2022
 - intfloat/multilingual_cc_news
 language:
 - fi
 license: apache-2.0
 pipeline_tag: text-generation
 tags:
 - finnish
 - llama
 library_name: transformers
 ---
 # Ahma-7B for Finnish
 Ahma-7B is a 7B parameter decoder-only transformer model based on Meta's Llama (v1) architecture, pretrained from scratch on the Finnish language. Its development was informed by the research presented in the paper [Scaling Data-Constrained Language Models](https://huggingface.co/papers/2305.16264). The original Llama model architecture was introduced in
 [this paper](https://arxiv.org/abs/2302.13971)
 and first released at [this page](https://github.com/facebookresearch/llama).
 What does Ahma mean? Ahma is the Finnish word for wolverine! In the Finnish Lapland, wolverines are the biggest cause of reindeer damage. 
 There are two different sized base Ahma models both pretrained from scratch, Ahma-3B for 139B tokens and Ahma-7B for 149B tokens:
 | Model | Context length | Layers | Dim | Heads | Params |
 |:--------------------------------------------------------------------------------|:---------------|:-------|:-----|:------|:-------|
 | [Ahma-3B](https://huggingface.co/Finnish-NLP/Ahma-3B) | 2048 | 26 | 3200 | 32 | 3.6B |
 | [Ahma-7B](https://huggingface.co/Finnish-NLP/Ahma-7B) | 2048 | 32 | 4096 | 32 | 7.0B |
 And two instruct-tuned versions:
 | Model | Context length | Layers | Dim | Heads | Params |
 |:--------------------------------------------------------------------------------|:---------------|:-------|:-----|:------|:-------|
 | [Ahma-3B-Instruct](https://huggingface.co/Finnish-NLP/Ahma-3B-Instruct) | 2048 | 26 | 3200 | 32 | 3.6B |
 | [Ahma-7B-Instruct](https://huggingface.co/Finnish-NLP/Ahma-7B-Instruct) | 2048 | 32 | 4096 | 32 | 7.0B |
 ## Paper Abstract
 The current trend of scaling language models involves increasing both parameter count and training dataset size. Extrapolating this trend suggests that training dataset size may soon be limited by the amount of text data available on the internet. Motivated by this limit, we investigate scaling language models in data-constrained regimes. Specifically, we run a large set of experiments varying the extent of data repetition and compute budget, ranging up to 900 billion training tokens and 9 billion parameter models. We find that with constrained data for a fixed compute budget, training with up to 4 epochs of repeated data yields negligible changes to loss compared to having unique data. However, with more repetition, the value of adding compute eventually decays to zero. We propose and empirically validate a scaling law for compute optimality that accounts for the decreasing value of repeated tokens and excess parameters. Finally, we experiment with approaches mitigating data scarcity, including augmenting the training dataset with code data or removing commonly used filters. Models and datasets from our 400 training runs are freely available at https://github.com/huggingface/datablations.
 ## Intended uses & limitations
 This model was pretrained only in a self-supervised way, without any supervised training. You can use this model for text generation or fine-tune it for a downstream task. This model followed a 2-stage pretraining approach where single-turn instruction-following examples were mixed in with the other training data in the second stage (explained more later in this readme). Thanks to this approach, this pretrained model is already capable of instruction following, but you might get even better results if you specifically fine-tune it for instruction following or other use cases. For instruction-following fine-tuning, you should use the same prompt format showcased below.
 ### How to use
 #### Fine-tuning
 We have now added a fine-tuning example notebook along with a video! \
 Notebook: https://huggingface.co/Finnish-NLP/Ahma-3B/blob/main/Finetune_Ahma_3B_example.ipynb \
 Video: https://www.youtube.com/watch?v=6mbgn9XzpS4 
 #### Inference
 If you want to use this model for instruction-following, you need to use the same prompt format we used in the second stage of the pretraining (basically the same format that Meta used in their Llama2 models). **Note: do not use "LlamaTokenizer" from the transformers library but always use the AutoTokenizer instead, or use the plain sentencepiece tokenizer.** Here is an example using the instruction-following prompt format, with some generation arguments you can modify for your use:
 ```python
 from transformers import AutoTokenizer, AutoModelForCausalLM
 system_prompt = "Olet tekoälyavustaja. Vastaat aina mahdollisimman avuliaasti. Vastauksesi eivät saa sisältää mitään haitallista, epäeettistä, rasistista, seksististä, vaarallista tai laitonta sisältöä. Jos kysymyksessä ei ole mitään järkeä tai se ei ole asiasisällöltään johdonmukainen, selitä miksi sen sijaan, että vastaisit jotain väärin. Jos et tiedä vastausta kysymykseen, älä kerro väärää tietoa."
 def format_prompt(prompt: str) -> str:
    prompt = f" [INST] <<SYS>>\n{system_prompt.strip()}\n<</SYS>>\n\n{prompt.strip()} [/INST] "
    return prompt
 tokenizer = AutoTokenizer.from_pretrained("Finnish-NLP/Ahma-7B")
 model = AutoModelForCausalLM.from_pretrained("Finnish-NLP/Ahma-7B")
 model = model.to("cuda")
 # use the custom prompt format function or the chat template feature in the tokenizer to format your inputs
 # prompt = format_prompt("Listaa kolme hyötyä, joita pienet avoimen lähdekoodin kielimallit tuovat?")
 # inputs = tokenizer(prompt, return_tensors="pt")
 messages = [
    {
        "role": "system",
        "content": system_prompt,
    },
    {"role": "user", "content": "Listaa kolme hyötyä, joita pienet avoimen lähdekoodin kielimallit tuovat?"},
 ]
 inputs = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
 )
 inputs = inputs.to("cuda")
 generated_ids = model.generate(
    inputs,
    temperature=0.6,
    penalty_alpha=0.6,
    top_k=4,
    do_sample=True,
    repetition_penalty=1.2,
    min_length=5,
    max_length=2048,
 )
 generated_text = tokenizer.batch_decode(
    generated_ids, skip_special_tokens=False
 )[0]
 """
 1. Parempi luettavuus ja ymmärtäminen: Pienten avoimen lähdekoodin kielimallien avulla voidaan luoda ymmärrettävämpää ja luettavampaa tekstiä, mikä helpottaa ihmisten ymmärtämistä ja tiedon hankkimista.
 2. Parempi mukautuvuus ja monipuolisuus: Avoimen lähdekoodin mallit antavat kehittäjille mahdollisuuden luoda räätälöityjä ratkaisuja omiin tarpeisiinsa, jolloin he voivat hyödyntää olemassa olevaa tietämystä ja asiantuntemusta.
 3. Lisääntynyt yhteistyö ja avoimuus: Avoimen lähdekoodin mallien ansiosta kehittäjät voivat tehdä yhteistyötä muiden kanssa, jakaa ideoita ja parantaa koodin laatua jakamalla oivalluksia ja parhaita käytäntöjä. Tämä edistää yhteistyöhön perustuvaa ympäristöä ja kannustaa jatkuvaan parantamiseen.
 """
 ```
 You may experiment with different system prompt instructions too if you like.
 ### Limitations and bias
 This model was trained only with Finnish texts, excluding code, so it should not be used for multilingual or code generation use cases.
 The training data used for this model contains a lot of content from the internet, which is far from neutral. Therefore, the model can have biased predictions. This bias will also affect all fine-tuned versions of this model.
 To reduce toxic content, training data was filtered with a toxicity classifier but it cannot truly eliminate all toxic text.
 ## Training data
 This model was pretrained on the combination of 14 datasets:
 - [CulturaX_fi_cleaned](https://huggingface.co/datasets/Finnish-NLP/CulturaX_fi_cleaned), we cleaned Finnish split from the original [CulturaX](https://huggingface.co/datasets/uonlp/CulturaX) dataset
 - [HPLT_1.2_fi_cleaned](https://huggingface.co/datasets/Finnish-NLP/HPLT_1.2_fi_cleaned), we cleaned Finnish split from the original [HPLT v1.2](https://hplt-project.org/datasets/v1.2) dataset
 - [wikipedia_20231101_fi_cleaned](https://huggingface.co/datasets/Finnish-NLP/wikipedia_20231101_fi_cleaned), we used the Finnish subset of the wikipedia (November 2023) dataset
 - [Reddit_fi_2006_2022](https://huggingface.co/datasets/Finnish-NLP/Reddit_fi_2006_2022), filtered and post-processed dataset of Finnish Reddit
 - [Yle Finnish News Archive 2011-2018](http://urn.fi/urn:nbn:fi:lb-2017070501)
 - [Yle Finnish News Archive 2019-2020](http://urn.fi/urn:nbn:fi:lb-2021050401)
 - [Finnish News Agency Archive (STT)](http://urn.fi/urn:nbn:fi:lb-2018121001)
 - [The Suomi24 Sentences Corpus](http://urn.fi/urn:nbn:fi:lb-2020021803)
 - [Project Lönnrot](http://www.lonnrot.net/)
 - [Finnish parliament speeches](https://avoindata.eduskunta.fi)
 - [multilingual_cc_news](https://huggingface.co/datasets/intfloat/multilingual_cc_news), we used the Finnish subset of the multilingual CC-News dataset
 - [fi-news-corpus](https://github.com/nkrusch/fi-news-corpus)
 - Finnish higher education public theses
 - Finnish single-turn instruction-following datasets, combination of multiple originally openly licensed English datasets translated to Finnish. For example, [Ultrachat, Aya, Capybara, etc](https://huggingface.co/collections/Finnish-NLP/sft-dpo-dataset-65f55dde1139c3cd683ff035)
 Raw datasets were automatically cleaned to filter out bad quality and non-Finnish examples. Also, a [perplexity](https://huggingface.co/course/chapter7/3#perplexity-for-language-models) score was calculated for all texts with a KenLM model which was trained with very clean Finnish texts only. This perplexity score can then be used to determine how "clean" Finnish language the text contains. To reduce toxic text, we used Finnish toxicity classifier [TurkuNLP/bert-large-finnish-cased-toxicity](https://huggingface.co/TurkuNLP/bert-large-finnish-cased-toxicity) released by TurkuNLP to classify all text examples. Classified toxicity label scores can then be used to determine how toxic the text is.
 All datasets were concatenated and the whole dataset was near-deduplicated using MinHashLSH from [text-dedup](https://github.com/ChenghaoMou/text-dedup). The top 95% perplexity score was used as a filtering threshold to filter out the worst-quality 5% of texts. To reduce the amount of toxic content, the dataset was filtered to include only text examples scoring lower than 80% for the toxicity labels "label_identity_attack", "label_insult", "label_threat" and "label_severe_toxicity".
 Finally, 20,000 text examples from each of the CulturaX, Wikipedia, Yle, STT, Suomi24, and Reddit datasets were randomly selected for the evaluation dataset.
 The final training dataset had 23 billion words (calculated with regex "\w+") and the evaluation dataset had 23 million words. After tokenization, the training dataset had 41 billion tokens and the evaluation dataset had 40 million tokens. For the 2-stage pretraining, training datasets are divided as follows:
 The first stage:
 |Dataset | Words | Ratio | 
 |:-----------------------------|:------------|:-------------|
 |CulturaX | 12.820B | 59.88% | 
 |HPLT v1.2 | 5.034B | 23.51% |
 |Suomi24 | 3.018B | 14.09% |
 |Reddit | 0.141B | 0.66% | 
 |CC-News | 0.311B | 1.45% |
 |FI news corpus | 0.004B | 0.02% |
 |Project Lönnrot | 0.083B | 0.39% |
 |**TOTAL** | **21.410B** | **100.0%** |
 The second stage:
 |Dataset | Words | Ratio | 
 |:--------------------------------------------------------------|:------------|:------------|
 |CulturaX (cleaner sample using KenLM perplexity score) | 2.252B | 55.48% | 
 |Wikipedia | 0.095B | 2.34% | 
 |STT | 0.253B | 6.23% |
 |Yle | 0.212B | 5.22% |
 |Finnish parliament speeches | 0.021B | 0.52% | 
 |Finnish higher education public theses | 0.855B | 21.07% | 
 |Finnish instruction-following datasets (note: 2X upsampled) | 0.371B | 9.14% | 
 |**TOTAL** | **4.059B** | **100.0%** |
 ## Training procedure
 ### Preprocessing
 Texts are tokenized using Byte Pair Encoding (BPE) with the SentencePiece implementation, splitting all numbers into individual digits and using bytes to decompose unknown UTF-8 characters. The total
 vocabulary size is 64k tokens. Inputs are sequences of 2048 consecutive tokens. Texts are not lowercased, so this model is case-sensitive: it distinguishes between finnish and Finnish. Both BOS and EOS tokens were used in the pretraining.
 ### 2-stage pretraining
 The model was trained on a TPUv4-32 VM, sponsored by the [Google TPU Research Cloud](https://sites.research.google/trc/about/). Training was conducted with a slightly modified Jax/Flax based [EasyLM](https://github.com/young-geng/EasyLM) framework, and inspired by the [OpenLLaMA](https://github.com/openlm-research/open_llama) project. The optimizer used was [Lion](https://arxiv.org/abs/2302.06675).
 The 2-stage pretraining approach was inspired by [MiniCPM](https://shengdinghu.notion.site/MiniCPM-Unveiling-the-Potential-of-End-side-Large-Language-Models-d4d3a8c426424654a4e80e42a711cb20) findings. For the first stage (79% of the entire training), we used noisier web-scraped datasets. For the second stage (21% of the entire training), we primarily used cleaner datasets and instruction-following datasets shuffled together, like in MiniCPM. The learning rate schedule for the 2-stage pretraining was Warmup-Stable-Decay (WSD). During the first stage, the learning rate schedule had a linear warmup for about 8 billion tokens to a peak learning rate of 1e-4 (note: with the Lion optimizer, the learning rate had to be about 10 times smaller than with the commonly used AdamW), followed by a stable phase where the rate of 1e-4 was kept constant. During the second stage, the learning rate schedule had a linear decay from 1e-4 to 6e-6 for the first 7 billion tokens, followed by a stable phase for the remaining tokens.
 In the first stage, the model was trained for 118 billion tokens, which is about three epochs of the first-stage training data, inspired by the findings of [Scaling Data-Constrained Language Models](https://huggingface.co/papers/2305.16264). In the second stage, the model was trained for 31 billion tokens, which is close to five epochs of the second-stage training data.
 Thanks to the WSD learning rate schedule, you can more easily experiment with different first-stage model checkpoints. For example, you could apply the second-stage training on an earlier checkpoint or continue pretraining further before the second stage. Model checkpoints were pushed to this repository every 100,000 training steps (approximately 13 billion tokens).
 - [900K](https://huggingface.co/Finnish-NLP/Ahma-7B/tree/5f6eb9498b17fece810d766f81c711c38a2b2de2)
 - [800K](https://huggingface.co/Finnish-NLP/Ahma-7B/tree/bc2d607ce302c1b0ff75c229496645cf232c6d98)
 - [700K](https://huggingface.co/Finnish-NLP/Ahma-7B/tree/69352a497d5953c5290296a1f429a450978c7f7f)
 - [600K](https://huggingface.co/Finnish-NLP/Ahma-7B/tree/760ab5f865b08d9a512c1df523a5c4deb6874322)
 - [500K](https://huggingface.co/Finnish-NLP/Ahma-7B/tree/32ea3d35931da8039180e80d67f6c323719ae50a)
 - [400K](https://huggingface.co/Finnish-NLP/Ahma-7B/tree/d1256a6815983053d0f9934f21f163d764fc5ecd)
 - [300K](https://huggingface.co/Finnish-NLP/Ahma-7B/tree/1e3094c66e788fe81d2aadad5bf8f0431358bd38)
 - [200K](https://huggingface.co/Finnish-NLP/Ahma-7B/tree/a4afd130fa0effea047deaaf8bf63b3eba1b323b)
 - [100K](https://huggingface.co/Finnish-NLP/Ahma-7B/tree/245fad2f5838af1465cb40ad42caef092e875cd9)
 ## Evaluation results
 ### FIN-bench
 This Ahma 7B base model was primarily evaluated using [FIN-bench by TurkuNLP](https://github.com/TurkuNLP/FIN-bench), and the same evaluation was carried out for other relevant Finnish models for comparison: [FinGPT 8B by TurkuNLP](https://huggingface.co/TurkuNLP/gpt3-finnish-8B), [Viking 7B by TurkuNLP, SiloGen and HPLT](https://huggingface.co/LumiOpen/Viking-7B), and [Poro 34B by SiloGen, TurkuNLP and HPLT](https://huggingface.co/LumiOpen/Poro-34B). Below are the results with 0-shot and 3-shot settings in FIN-bench.
 0-shot results:
 | Benchmark | Ahma 3B base (instruct prompt format) | Ahma 3B Instruct (instruct prompt format) | Ahma 7B base (instruct prompt format) | Ahma 7B Instruct (instruct prompt format) | FinGPT 8B | Viking 7B | Poro 34B (8bit quant) |
 |:---------------------------|:--------------------------------------|:------------------------------------------|:--------------------------------------|:------------------------------------------|:----------|:----------|:----------------------|
 | Analogies | 50.77 | 48.46 | 56.92 | 41.54 | 49.23 | 40.00 | 54.62 |
 | Arithmetic | 27.64 | 22.14 | 11.50 | 14.70 | 33.15 | 30.16 | 30.34 |
 | Cause and Effect | 59.48 | 58.82 | 59.48 | 53.60 | 66.01 | 58.82 | 62.74 |
 | Emotions | 36.25 | 28.12 | 36.25 | 27.50 | 22.50 | 26.25 | 35.63 |
 | Empirical Judgements | 33.33 | 35.35 | 33.33 | 33.33 | 27.27 | 33.33 | 49.49 |
 | General Knowledge | 44.29 | 48.57 | 51.43 | 37.14 | 40.00 | 24.29 | 51.43 |
 | HHH Alignment | 42.09 | 41.66 | 44.23 | 43.22 | 41.81 | 42.51 | 42.92 |
 | Intent Recognition | 24.42 | 26.16 | 43.64 | 56.94 | 17.49 | 22.40 | 68.35 |
 | Misconceptions | 46.27 | 47.01 | 46.27 | 47.01 | 53.73 | 53.73 | 52.24 |
 | Paraphrase | 59.50 | 73.00 | 67.00 | 70.50 | 51.00 | 50.00 | 51.00 |
 | Sentence Ambiguity | 53.33 | 65.00 | 60.00 | 63.33 | 51.67 | 48.33 | 50.00 |
 | Similarities Abstraction | 65.79 | 68.42 | 71.05 | 61.84 | 60.53 | 65.79 | 60.53 |
 | **Non-Arithmetic Average** | **47.55** | **48.95** | **51.33** | **48.30** | **46.17** | **44.42** | **52.08** |
 | **Overall Average** | **36.49** | **34.06** | **29.20** | **29.64** | **38.93** | **36.50** | **40.00** |
 3-shot results:
 | Benchmark | Ahma 3B base (instruct prompt format) | Ahma 3B Instruct (instruct prompt format) | Ahma 7B base (instruct prompt format) | Ahma 7B Instruct (instruct prompt format) | FinGPT 8B | Viking 7B | Poro 34B (8bit quant) |
 |:---------------------------|:--------------------------------------|:------------------------------------------|:--------------------------------------|:------------------------------------------|:----------|:----------|:----------------------|
 | Analogies | 50.77 | 49.23 | 49.23 | 43.08 | 40.77 | 54.62 | 76.92 |
 | Arithmetic | 38.38 | 43.89 | 20.88 | 26.81 | 43.63 | 45.78 | 53.68 |
 | Cause and Effect | 60.78 | 64.71 | 66.01 | 62.74 | 64.05 | 58.17 | 67.32 |
 | Emotions | 30.00 | 41.25 | 30.00 | 53.75 | 44.37 | 48.13 | 56.87 |
 | Empirical Judgements | 46.46 | 44.44 | 39.39 | 39.39 | 32.32 | 43.43 | 63.64 |
 | General Knowledge | 47.14 | 40.00 | 27.14 | 44.29 | 54.29 | 28.57 | 74.29 |
 | HHH Alignment | 43.53 | 44.80 | 43.80 | 45.09 | 45.39 | 44.80 | 46.07 |
 | Intent Recognition | 20.52 | 44.22 | 36.42 | 39.02 | 51.45 | 58.82 | 83.67 |
 | Misconceptions | 50.75 | 52.24 | 46.27 | 51.49 | 52.99 | 46.27 | 52.99 |
 | Paraphrase | 50.50 | 58.50 | 57.50 | 65.00 | 53.00 | 54.50 | 55.00 |
 | Sentence Ambiguity | 53.33 | 48.33 | 53.33 | 51.67 | 51.67 | 53.33 | 66.67 |
 | Similarities Abstraction | 69.74 | 72.37 | 72.37 | 69.74 | 64.47 | 73.68 | 75.00 |
 | **Non-Arithmetic Average** | **48.48** | **51.49** | **49.05** | **51.63** | **51.19** | **50.94** | **61.96** |
 | **Overall Average** | **42.87** | **47.27** | **33.41** | **37.84** | **46.99** | **48.07** | **57.36** |
 As we can see, the Ahma 7B base model has poor arithmetic performance, but in non-arithmetic tasks it clearly outperforms similarly sized models like FinGPT 8B and Viking 7B, especially in 0-shot usage. The Ahma 7B base model is even on par with the 5X larger Poro 34B model in non-arithmetic tasks in 0-shot usage. This result might be attributed to Ahma's 2-stage pretraining and the inclusion of instruction-following examples during the pretraining phase.
 In a 3-shot setting, the results are more mixed. The poorer performance of the Ahma 7B base model in 3-shot settings might be due to the use of the instruct prompt format and having only single-turn instruction-following training examples.
 ### MTBench Finnish
 This Ahma 7B base model was also evaluated using [MTBench Finnish by LumiOpen](https://github.com/LumiOpen/FastChat/tree/main/fastchat/llm_judge) even though this Ahma model is not fine-tuned for chat. Since MTBench also evaluates multi-turn chats while Ahma base models were only pretrained with single-turn instruction-following examples, we have reported MTBench Finnish results separately for their single-turn and multi-turn evaluation examples. [Poro 34B Chat by SiloGen, TurkuNLP and HPLT](https://huggingface.co/LumiOpen/Poro-34B-chat) model's presumably multi-turn results are copied from their model card for the comparison.
 Single-turn results:
 | Benchmark | Ahma 3B base (instruct prompt format) | Ahma 3B Instruct (instruct prompt format) | Ahma 7B base (instruct prompt format) | Ahma 7B Instruct (instruct prompt format) |
 |:--------------------|:--------------------------------------|:------------------------------------------|:--------------------------------------|:------------------------------------------|
 | Coding | 1.00 | 1.00 | 1.70 | 1.10 |
 | Extraction | 2.00 | 1.30 | 3.10 | 3.00 |
 | Humanities | 4.05 | 6.20 | 6.60 | 8.00 |
 | Math | 3.00 | 3.20 | 3.90 | 2.90 |
 | Reasoning | 2.90 | 4.60 | 3.70 | 5.70 |
 | Roleplay | 4.80 | 6.50 | 6.60 | 7.20 |
 | STEM | 5.10 | 5.95 | 6.75 | 7.30 |
 | Writing | 6.60 | 9.00 | 7.10 | 8.80 |
 | **Overall Average** | **3.68** | **4.72** | **4.93** | **5.50** |
 Multi-turn results:
 | Benchmark | Ahma 3B base (instruct prompt format) | Ahma 3B Instruct (instruct prompt format) | Ahma 7B base (instruct prompt format) | Ahma 7B Instruct (instruct prompt format) | Poro 34B Chat |
 |:--------------------|:--------------------------------------|:------------------------------------------|:--------------------------------------|:------------------------------------------|:--------------|
 | Coding | 1.00 | 1.00 | 1.40 | 1.05 | 3.70 |
 | Extraction | 1.55 | 1.15 | 2.05 | 2.65 | 6.37 |
 | Humanities | 3.25 | 6.20 | 4.95 | 7.85 | 9.25 |
 | Math | 2.20 | 2.70 | 2.50 | 2.40 | 1.20 |
 | Reasoning | 2.45 | 3.50 | 2.55 | 4.50 | 4.35 |
 | Roleplay | 4.90 | 6.40 | 6.35 | 6.60 | 7.35 |
 | STEM | 4.20 | 4.78 | 4.28 | 5.40 | 7.80 |
 | Writing | 3.80 | 6.65 | 4.10 | 6.25 | 8.50 |
 | **Overall Average** | **2.92** | **4.05** | **3.52** | **4.59** | **6.06** |
 As we can see, the Ahma 7B base model struggles with multi-turn examples, as expected, since it has only been pretrained with single-turn instruction-following examples. In addition, coding performance was expectedly poor because the Ahma 7B model is not trained with code data. In the single-turn setting, Ahma 7B beats both the Ahma 3B base and Instruct-tuned versions, demonstrating greater base capability to be further improved with Instruct-tuning.
 ## Acknowledgements
 This project would not have been possible without compute generously provided by Google through the
 [TPU Research Cloud](https://sites.research.google/trc/).
 ## Team Members
 - Aapo Tanskanen, [Hugging Face profile](https://huggingface.co/aapot), [LinkedIn profile](https://www.linkedin.com/in/aapotanskanen/)
 - Rasmus Toivanen, [Hugging Face profile](https://huggingface.co/RASMUS), [LinkedIn profile](https://www.linkedin.com/in/rasmustoivanen/)
 Feel free to contact us for more details 🤗
 ![Ahma](ahma.jpg)
--- a/ahma.jpg
+++ b/ahma.jpg
--- a/config.json
+++ b/config.json
@@ -0,0 +1,27 @@
 {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.38.0.dev0",
  "use_cache": true,
  "vocab_size": 64256
 }
--- a/convert_to_hf_model.sh
+++ b/convert_to_hf_model.sh
@@ -0,0 +1,4 @@
 JAX_PLATFORM_NAME=cpu python3 -m EasyLM.models.llama.convert_easylm_to_hf \
    --load_checkpoint='' \
    --model_size='7b' \
    --output_dir='./'
--- a/generation_config.json
+++ b/generation_config.json
@@ -0,0 +1,6 @@
 {
  "_from_model_config": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "transformers_version": "4.38.0.dev0"
 }
--- a/model-00001-of-00003.safetensors
+++ b/model-00001-of-00003.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:625f48801b93273fa419f51704ccc45bba97010337ed52ed8db290767a152c71
 size 4978830560
--- a/model-00002-of-00003.safetensors
+++ b/model-00002-of-00003.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:2d8f06f2f59ecbffe4fce68ab67d5ad530d951fbce594f7c9850d9bce0b739a3
 size 4991431320
--- a/model-00003-of-00003.safetensors
+++ b/model-00003-of-00003.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:c34dac66ee076f27cda65ce37ae74894f30a4e6cc49f898cf1bf67cbf3c1f10e
 size 4035085208
--- a/model.safetensors.index.json
+++ b/model.safetensors.index.json
@@ -0,0 +1,298 @@
 {
  "metadata": {
    "total_size": 14005313536
  },
  "weight_map": {
    "lm_head.weight": "model-00003-of-00003.safetensors",
    "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
    "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
    "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
    "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
    "model.norm.weight": "model-00003-of-00003.safetensors"
  }
 }
--- a/pretrain_llama_7b.sh
+++ b/pretrain_llama_7b.sh
@@ -0,0 +1,55 @@
 #! /bin/bash
 # Launches EasyLM.models.llama.llama_train with the '7b' LLaMA config.
 # The trainer flags are collected in a bash array, grouped by topic, and
 # expanded onto a single python3 command line in the order listed below.
 #
 # Put your WANDB API key here to enable logging to wandb.
 export WANDB_API_KEY=''
 # TPU specific flags to improve training throughput
 export LIBTPU_INIT_ARGS='--xla_jf_spmd_threshold_for_windowed_einsum_mib=0 --xla_tpu_spmd_threshold_for_allgather_cse=10000 --xla_tpu_spmd_rewrite_einsum_with_reshape=true --xla_enable_async_all_gather=true --jax_enable_async_collective_offload=true --xla_tpu_enable_latency_hiding_scheduler=true TPU_MEGACORE=MEGACORE_DENSE'
 train_flags=(
     # --- Distributed setup, device mesh and compute dtype ---
     --jax_distributed.initialize_jax_distributed=True
     --mesh_dim='1,-1,4'
     --dtype='bf16'
     # --- Run length and evaluation / logging / checkpoint cadence (in steps) ---
     --total_steps=900000
     --eval_freq=50000
     --log_freq=1000
     --save_model_freq=2000
     --save_milestone_freq=50000
     # --- Model config and resume inputs (the last three are left empty here) ---
     --load_llama_config='7b'
     --update_llama_config=''
     --load_dataset_state=''
     --load_checkpoint=''
     # --- Tokenizer model file, given as a relative path ---
     --tokenizer.vocab_file='tokenizer.model'
     # --- Lion optimizer and learning-rate schedule ---
     # NOTE(review): lr_warmup_steps (60000) + lr_constant_steps (900000)
     # already exceed total_steps (900000), so the decay phase configured by
     # end_lr / lr_decay_steps is presumably never reached in this run --
     # confirm that this is intended.
     --optimizer.type='lion'
     --optimizer.lion_optimizer.weight_decay=1.0
     --optimizer.lion_optimizer.lr_schedule_type='warmup_constant_linear_decay'
     --optimizer.lion_optimizer.lr=1e-4
     --optimizer.lion_optimizer.end_lr=1e-5
     --optimizer.lion_optimizer.lr_warmup_steps=60000
     --optimizer.lion_optimizer.lr_constant_steps=900000
     --optimizer.lion_optimizer.lr_decay_steps=100000
     --optimizer.lion_optimizer.bf16_momentum=True
     # --- Training data: 'train' split, 'text' field, BOS and EOS added ---
     --train_dataset.type='huggingface'
     --train_dataset.text_processor.fields='text'
     --train_dataset.text_processor.add_eos_token=True
     --train_dataset.text_processor.add_bos_token=True
     --train_dataset.huggingface_dataset.path='/researchdisk/lm_training_dataset_first_stage'
     --train_dataset.huggingface_dataset.split='train'
     --train_dataset.huggingface_dataset.seq_length=2048
     --train_dataset.huggingface_dataset.batch_size=64
     # --- Evaluation data: same dataset path, 'validation' split ---
     --eval_dataset.type='huggingface'
     --eval_dataset.text_processor.fields='text'
     --eval_dataset.text_processor.add_eos_token=True
     --eval_dataset.text_processor.add_bos_token=True
     --eval_dataset.huggingface_dataset.path='/researchdisk/lm_training_dataset_first_stage'
     --eval_dataset.huggingface_dataset.split='validation'
     --eval_dataset.huggingface_dataset.seq_length=2048
     --eval_dataset.huggingface_dataset.batch_size=64
     # --- Checkpoint contents and logger / output locations ---
     --checkpointer.save_optimizer_state=True
     --logger.online=True
     --logger.prefix='EasyLM'
     --logger.project="llama-7b-v2"
     --logger.output_dir="gs://finnish-nlp-research-us/llama-7b-v2-checkpoint"
     --logger.wandb_dir="./"
 )
 # Each array element expands to exactly one argument of the trainer.
 python3 -m EasyLM.models.llama.llama_train "${train_flags[@]}"
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,23 @@
 {
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
 }
--- a/tokenizer.json
+++ b/tokenizer.json
--- a/tokenizer.model
+++ b/tokenizer.model
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:1980c00aa3cb5455177a39efa3e60e7b8887ee89c3f7b8950719592a08ad9456
 size 1400411
--- a/tokenizer.vocab
+++ b/tokenizer.vocab
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -0,0 +1,75 @@
 {
  "add_bos_token": true,
  "add_eos_token": false,
  "add_prefix_space": true,
  "added_tokens_decoder": {
    "0": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "[INST]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "4": {
      "content": "[/INST]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "5": {
      "content": "<<SYS>>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "6": {
      "content": "<</SYS>>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'Olet tekoälyavustaja. Vastaat aina mahdollisimman avuliaasti. Vastauksesi eivät saa sisältää mitään haitallista, epäeettistä, rasistista, seksististä, vaarallista tai laitonta sisältöä. Jos kysymyksessä ei ole mitään järkeä tai se ei ole asiasisällöltään johdonmukainen, selitä miksi sen sijaan, että vastaisit jotain väärin. Jos et tiedä vastausta kysymykseen, älä kerro väärää tietoa.' %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + ' [INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + eos_token }}{% endif %}{% endfor %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
  "legacy": false,
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": null,
  "sp_model_kwargs": {},
  "spaces_between_special_tokens": false,
  "tokenizer_class": "PreTrainedTokenizerFast",
  "unk_token": "<unk>",
  "use_default_system_prompt": false
 }
--- a/train_sentencepiece.py
+++ b/train_sentencepiece.py
@@ -0,0 +1,10 @@
 import sentencepiece as spm
 # Options for the tokenizer training run, gathered in one mapping and
 # unpacked as keyword arguments into SentencePieceTrainer.train below.
 trainer_options = {
     # Training corpus and the prefix used for the trainer's output files.
     "input": "/researchdisk/training_dataset_sentences/train.txt",
     "model_prefix": "tokenizer",
     # Model type, vocabulary size and text handling switches.
     "model_type": "bpe",
     "split_digits": True,
     "vocab_size": 64256,
     "byte_fallback": True,
     "normalization_rule_name": "nfkc",
     # Chat-markup strings registered as user-defined symbols.
     "user_defined_symbols": ["[INST]", "[/INST]", "<<SYS>>", "<</SYS>>"],
     # ASCII letters plus å, ä, ö in lower and upper case.
     "required_chars": "abcdefghijklmnopqrstuvwxyzåäöABCDEFGHIJKLMNOPQRSTUVWXYZÅÄÖ",
     # Corpus-size handling, sentence sampling and worker threads.
     "train_extremely_large_corpus": True,
     "input_sentence_size": 500000000,
     "shuffle_input_sentence": True,
     "num_threads": 96,
 }
 spm.SentencePieceTrainer.train(**trainer_options)