From 22d07727a0b96660c13f20ed8631b6cfb716ba46 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Thu, 11 Jun 2026 08:53:12 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: mesolitica/mallam-1.1B-4096 Source: Original Platform --- .gitattributes | 50 +++++++++++++++++++++++++++++++++++++ README.md | 55 +++++++++++++++++++++++++++++++++++++++++ config.json | 26 +++++++++++++++++++ configuration.json | 1 + generation_config.json | 6 +++++ model.safetensors | 3 +++ special_tokens_map.json | 36 +++++++++++++++++++++++++++ tokenizer.json | 3 +++ tokenizer_config.json | 53 +++++++++++++++++++++++++++++++++++++++ 9 files changed, 233 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 config.json create mode 100644 configuration.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..98a1fbc --- /dev/null +++ b/.gitattributes @@ -0,0 +1,50 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +model.safetensors filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..bcd017b --- /dev/null +++ b/README.md @@ -0,0 +1,55 @@ +--- +language: +- ms +--- + +# MaLLaM 🌙 1.1B (Malaysia Large Language Model), Pretrain 1.1B 4096 context length on Malaysian text + +Pretrain from scratch 1.1B parameters using Mistral architecture on 90B Malaysian text tokens. + +README at https://github.com/mesolitica/malaya/tree/5.1/pretrained-model/mistral + +- Trained on 90B tokens, gathered at https://github.com/malaysia-ai/dedup-text-dataset/tree/main/pretrain-llm +- We use Ray cluster to train on 5 nodes of 4x A100 80GB, https://github.com/malaysia-ai/jupyter-gpu/tree/main/ray + +WandB, https://wandb.ai/mesolitica/pretrain-mistral-1.1b?workspace=user-husein-mesolitica + +WandB report, https://wandb.ai/mesolitica/pretrain-mistral-3b/reports/Pretrain-Larger-Malaysian-Mistral--Vmlldzo2MDkyOTgz + +Technical report, https://github.com/mesolitica/malaya/wiki/MaLLaM-%F0%9F%8C%99-Malaysia-Large-Language-Model + +## how-to + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig +import torch + +TORCH_DTYPE = 'bfloat16' +nf4_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type='nf4', + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=getattr(torch, TORCH_DTYPE) +) + +tokenizer = AutoTokenizer.from_pretrained('mesolitica/mallam-1.1B-4096') +model = AutoModelForCausalLM.from_pretrained( + 'mesolitica/mallam-1.1B-4096', + use_flash_attention_2 = True, + quantization_config = nf4_config +) +prompt = 'nama saya' +inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda') + +generate_kwargs = dict( + inputs, + max_new_tokens=512, + top_p=0.95, + top_k=50, + temperature=0.9, + do_sample=True, + num_beams=1, + repetition_penalty=1.05, +) +r = model.generate(**generate_kwargs) +``` \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..4a0deb9 --- /dev/null +++ b/config.json @@ -0,0 +1,26 @@ +{ + "_name_or_path": "huseinzol05-dummy-mistral-1.1b/checkpoint-54300", + "architectures": [ + "MistralForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5632, + "max_position_embeddings": 32768, + "model_type": "mistral", + "num_attention_heads": 32, + "num_hidden_layers": 22, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_theta": 10000.0, + "sliding_window": 4096, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.36.0.dev0", + "use_cache": true, + "vocab_size": 32000 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..c441554 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "transformers_version": "4.36.0.dev0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..87e7164 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50935f3d70d89594cb3b2b0f71ceae17a7717f14a6b29f36c42c1abf55a106bd +size 2246257208 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..70be356 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,36 @@ +{ + "additional_special_tokens": [ + "", + "", + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..539e716 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab85dbb9764201eb6e7ad48f07922777e4abaf5b3d00c91e6cfdaa789c72667f +size 1343999 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..b2e0858 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,53 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "eos_token": "", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": "" +}