From e1305a41d55bb6c66b863948e262ce2e9fe7c367 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Tue, 5 May 2026 08:30:49 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: gia-uh/cecilia-2b-instruct-v1 Source: Original Platform --- .gitattributes | 39 ++++++++++++++++++++ README.md | 47 ++++++++++++++++++++++++ cecilia-2b-instruct-v1.f16.gguf | 3 ++ config.json | 30 ++++++++++++++++ generation_config.json | 6 ++++ model.safetensors | 3 ++ special_tokens_map.json | 28 +++++++++++++++ tokenizer.json | 3 ++ tokenizer.model | 3 ++ tokenizer_config.json | 64 +++++++++++++++++++++++++++++++++ 10 files changed, 226 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 cecilia-2b-instruct-v1.f16.gguf create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer.model create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..1fbb959 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,39 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs 
-text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +cecilia_ft_ms_v1.gguf filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +cecilia-instruct-v1.f16.gguf filter=lfs diff=lfs merge=lfs -text +cecilia-2b-instruct-v1.f16.gguf filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..6ac3d99 --- /dev/null +++ b/README.md @@ -0,0 +1,47 @@ +--- +library_name: transformers +tags: +- llama +- gguf +- safetensors +- finetune +base_model: +- gia-uh/cecilia-2b-v0.1 +license: mit +datasets: +- gia-uh/maria-silvia-v1 +language: +- es +- en +pipeline_tag: text-generation +--- + +# Cecilia: The Cuban Language Model +Cecilia is a family of language models continually pretrained on Cuban written text, capturing the linguistic, cultural, and social nuances of Cuban Spanish. These models are designed to support natural language processing tasks with a focus on Cuban language varieties and cultural context. 
+ +## About Cecilia 2B Instruct v1 +This model is a fine-tuned version of **Cecilia 2B v0.1**, which is a continually pre-trained model based on [Salamandra 2b](https://huggingface.co/BSC-LT/salamandra-2b). It belongs to the **Cecilia** collection and follows the same lineage as [Cecilia 2B v0.1](https://huggingface.co/gia-uh/cecilia-2b-v0.1). + +## Model Formats +This repository is a **Hybrid Release** containing: +- **Safetensors:** For use with Hugging Face `transformers`. +- **GGUF (FP16):** For use with `llama.cpp`, `vLLM`, or local inference tools. + +## Quantizations +Official quantized GGUF versions (Q8_0, Q6_K, Q4_K_M) in the repository [gia-uh/cecilia-2b-instruct-v1-GGUF](https://huggingface.co/gia-uh/cecilia-2b-instruct-v1-GGUF) + +## Quickstart (Transformers) + +```python +from transformers import AutoConfig, AutoModel, AutoTokenizer + +repo_id = "gia-uh/cecilia-2b-instruct-v1" + +# Load model and tokenizer +config = AutoConfig.from_pretrained(repo_id, trust_remote_code=False) +tokenizer = AutoTokenizer.from_pretrained(repo_id) +model = AutoModel.from_pretrained(repo_id, trust_remote_code=False) + +# Simple inference +inputs = tokenizer("Hola, ¿qué bolá?", return_tensors="pt") +outputs = model(**inputs) \ No newline at end of file diff --git a/cecilia-2b-instruct-v1.f16.gguf b/cecilia-2b-instruct-v1.f16.gguf new file mode 100644 index 0000000..54f9dae --- /dev/null +++ b/cecilia-2b-instruct-v1.f16.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57af0d98047e0c4efe9b647a8a3ba812bf36e6e1dd64a6275875f4e52c0efe8c +size 4513629344 diff --git a/config.json b/config.json new file mode 100644 index 0000000..a20d9f3 --- /dev/null +++ b/config.json @@ -0,0 +1,30 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5440, 
+ "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "num_key_value_heads": 16, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "use_cache": false, + "vocab_size": 256000 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..8df1deb --- /dev/null +++ b/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "transformers_version": "4.51.3" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..cc2591e --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8294b7435412e8acf7c22987163cec1f7e241eb6d732eda605c72afae14768b8 +size 4507005744 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..6bd935b --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..0f0bd57 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:990527d1e7b98c027d386c742250b2f8517bd3adf98c46cc6c1c2f35b234c224 +size 37007559 diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000..bf57480 --- /dev/null +++ 
b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab94ddf46d14f0279254858d53770c5319c5129d47291ee2bada530271cb1292 +size 4813276 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..c8cec3b --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,64 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "5": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>" + ], + "bos_token": "", + "chat_template": "{%- if not date_string is defined %}{%- set date_string = \"2025-11-13\" %}{%- endif %}{%- set base_system_message = \"Soy CecilIA, un modelo de lenguaje experimental desarrollado en colaboración entre la Facultad de Matemática y Computación (MATCOM) de la Universidad de La Habana y el Grupo de Procesamiento del Lenguaje y Sistemas de Información (GPLSI) de la Universidad de Alicante, entrenado para entender y procesar el español hablado en Cuba. Estás diseñado para responder a preguntas de cultura, historia, geografía, y conversar de forma general sobre Cuba. 
Responde siempre de forma amigable y coloquial.\" -%}{%- if messages and messages[0].role == \"system\" -%}{%- set task_system = messages[0].content -%}{%- set messages = messages[1:] -%}{%- else -%}{%- set task_system = \"\" -%}{%- endif -%}{%- if task_system -%}{%- set system_message = base_system_message + \"\n\" + task_system -%}{%- else -%}{%- set system_message = base_system_message -%}{%- endif -%}{{ \"<|im_start|>system\n\" + system_message + \"<|im_end|>\n\" }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 8192, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} \ No newline at end of file