From c6357987812a4e2246a9da27b5024be23710b924 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Mon, 18 May 2026 02:51:38 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: junaid008/qehwa-pashto-llm Source: Original Platform --- .gitattributes | 36 ++++ README.md | 455 +++++++++++++++++++++++++++++++++++++++++ config.json | 64 ++++++ generation_config.json | 9 + model.safetensors | 3 + tokenizer.json | 3 + tokenizer_config.json | 15 ++ 7 files changed, 585 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl 
filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..a0b6945 --- /dev/null +++ b/README.md @@ -0,0 +1,455 @@ +--- +language: +- ps +- en +- ur +license: apache-2.0 +library_name: transformers +tags: +- pashto +- peshawari +- pakistani-pashto +- causal-lm +- qwen2 +- sft +- cpt +- unsloth +- trl +base_model: Qwen/Qwen2.5-7B +pipeline_tag: text-generation +--- + +# ☕ Qehwa — Pashto's First LLM + +**The first and best Pakistani Pashto large language model — specifically trained on Peshawari dialect.** + +Built by a solo developer as a free and open resource for 60+ million Pashto speakers worldwide. + +> ⚠️ This model performs best on Pakistani/Peshawari Pashto. Performance may be lower on Afghan Pashto dialect. + +--- + +## 🌟 Model Description + +**Qehwa** is a fully instruction-tuned Pashto language model built on top of Qwen2.5-7B. It is the result of two-stage training: + +1. **Continued Pre-Training (CPT)** on 3.4 million clean Pakistani Pashto documents +2. **Supervised Fine-Tuning (SFT)** on 126,519 high-quality Peshawari Pashto instruction-response pairs + +This is the **first dedicated Pakistani Pashto LLM** — no comparable model exists publicly. It specifically targets the **Peshawari/KPK dialect** rather than generic or Afghan Pashto. 
+ +This repo contains the **fully merged model** — ready to use with standard transformers, no additional libraries required. + +--- + +## ✨ Capabilities + +- ✅ Answers questions in pure Peshawari Pashto +- ✅ Responds to English instructions in Pashto +- ✅ Responds to Urdu instructions in Pashto +- ✅ Natural Pashto conversation +- ✅ Pashto creative writing and poetry +- ✅ Islamic topics in Pashto +- ✅ KPK history, culture, and geography +- ✅ Pashtunwali traditions and ethics +- ✅ Pashto grammar correction +- ✅ English to Pashto translation +- ✅ Correct Pashto-specific characters: ښ ږ ټ ډ ړ ځ + +--- + +## 📊 Evaluation Results + +Qehwa was evaluated on a custom benchmark of **150 tests across 15 categories** — the first-ever comprehensive Pashto LLM benchmark. Since no standard Pashto benchmark exists publicly, this evaluation was designed specifically for Pakistani Pashto. + +### Top Performing Categories + +| Category | Score | +|---|---| +| English → Pashto | **90%** 🔥🔥 | +| Urdu → Pashto | **84%** 🔥🔥 | +| Health & Daily Life in Pashto | **90%** 🔥🔥 | +| Culture & History | **90%** 🔥 | +| Geography & Nature | **90%** 🔥 | + +> **Overall Average Accuracy across all 15 benchmark categories: 85.3%** + +### Evaluation Methodology +- 150 custom Pashto prompts across 15 categories +- Evaluated on an A100 40GB GPU +- Human-reviewed outputs for fluency, accuracy and dialect correctness +- No existing Pashto benchmark was available — this is the first Pashto LLM benchmark + +--- + +## 💻 Installation +```bash +pip install transformers accelerate torch +``` + +For faster inference: +```bash +pip install unsloth +``` + +For running locally on a CPU or small GPU (bitsandbytes is only needed for the 4-bit GPU loading in Method 2): +```bash +pip install transformers accelerate bitsandbytes +``` + +--- + +## 🚀 How to Use + +### ✅ Method 1 — Transformers (Recommended) + +Best for: Research, production, standard usage +```python +from transformers import AutoModelForCausalLM, AutoTokenizer +import torch + +model_name = "junaid008/qehwa-pashto-llm" + 
+tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype = torch.bfloat16, + device_map = "auto", +) + +ALPACA_TEMPLATE = """Below is an instruction in Pashto or English. Write a detailed response in Pashto. + +### Instruction: +{} + +### Response: +{}""" + +def generate(prompt): + inputs = tokenizer( + ALPACA_TEMPLATE.format(prompt, ""), + return_tensors = "pt", + ).to("cuda") + + outputs = model.generate( + **inputs, + max_new_tokens = 500, + temperature = 0.7, + do_sample = True, + repetition_penalty = 1.1, + pad_token_id = tokenizer.eos_token_id, + ) + + response = tokenizer.decode(outputs[0], skip_special_tokens=True) + return response.split("### Response:")[-1].strip() + +# Pashto input +print(generate("د پیښور تاریخ راته ووایه")) + +# English input +print(generate("Tell me about Pashtunwali")) + +# Urdu input +print(generate("پشاور کے بارے میں بتاؤ")) +``` + +--- + +### ✅ Method 2 — 4-bit Quantization (Low VRAM) + +Best for: GPUs with limited VRAM (8GB+) +```python +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +import torch + +model_name = "junaid008/qehwa-pashto-llm" + +bnb_config = BitsAndBytesConfig( + load_in_4bit = True, + bnb_4bit_quant_type = "nf4", + bnb_4bit_compute_dtype = torch.bfloat16, + bnb_4bit_use_double_quant = True, +) + +tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModelForCausalLM.from_pretrained( + model_name, + quantization_config = bnb_config, + device_map = "auto", +) + +ALPACA_TEMPLATE = """Below is an instruction in Pashto or English. Write a detailed response in Pashto. 
+ +### Instruction: +{} + +### Response: +{}""" + +def generate(prompt): + inputs = tokenizer( + ALPACA_TEMPLATE.format(prompt, ""), + return_tensors = "pt", + ).to("cuda") + + outputs = model.generate( + **inputs, + max_new_tokens = 500, + temperature = 0.7, + do_sample = True, + repetition_penalty = 1.1, + pad_token_id = tokenizer.eos_token_id, + ) + + response = tokenizer.decode(outputs[0], skip_special_tokens=True) + return response.split("### Response:")[-1].strip() + +print(generate("پښتونولي تشریح کړه")) +``` + +--- + +### ✅ Method 3 — Unsloth (2x Faster Inference) + +Best for: Speed-optimized usage, Colab, A100/H100 +```python +from unsloth import FastLanguageModel + +model, tokenizer = FastLanguageModel.from_pretrained( + model_name = "junaid008/qehwa-pashto-llm", + max_seq_length = 2048, + dtype = None, + load_in_4bit = False, +) +FastLanguageModel.for_inference(model) + +ALPACA_TEMPLATE = """Below is an instruction in Pashto or English. Write a detailed response in Pashto. + +### Instruction: +{} + +### Response: +{}""" + +import torch +inputs = tokenizer( + ALPACA_TEMPLATE.format("د پیښور تاریخ راته ووایه", ""), + return_tensors = "pt", +).to("cuda") + +outputs = model.generate( + **inputs, + max_new_tokens = 500, + temperature = 0.7, + do_sample = True, + repetition_penalty = 1.1, + pad_token_id = tokenizer.pad_token_id, +) + +response = tokenizer.decode(outputs[0], skip_special_tokens=True) +print(response.split("### Response:")[-1].strip()) +``` + +--- + +### ✅ Method 4 — CPU Only (No GPU) + +Best for: Testing on laptop, no GPU available (slow but works) +```python +from transformers import AutoModelForCausalLM, AutoTokenizer +import torch + +model_name = "junaid008/qehwa-pashto-llm" + +tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype = torch.float32, # float32 for CPU + device_map = "cpu", +) + +ALPACA_TEMPLATE = """Below is an instruction in Pashto or English. 
Write a detailed response in Pashto. + +### Instruction: +{} + +### Response: +{}""" + +inputs = tokenizer( + ALPACA_TEMPLATE.format("پښتو ژبه د چا ده؟", ""), + return_tensors = "pt", +) + +outputs = model.generate( + **inputs, + max_new_tokens = 200, + do_sample = False, # greedy for CPU speed + pad_token_id = tokenizer.eos_token_id, +) + +response = tokenizer.decode(outputs[0], skip_special_tokens=True) +print(response.split("### Response:")[-1].strip()) +``` + +--- + +### ✅ Method 5 — Google Colab (Free) + +Best for: Trying without any local setup + +Open in Colab and run: +```python +# Install +!pip install transformers accelerate -q + +from transformers import AutoModelForCausalLM, AutoTokenizer +import torch + +tokenizer = AutoTokenizer.from_pretrained("junaid008/qehwa-pashto-llm") +model = AutoModelForCausalLM.from_pretrained( + "junaid008/qehwa-pashto-llm", + torch_dtype = torch.bfloat16, + device_map = "auto", +) + +ALPACA_TEMPLATE = """Below is an instruction in Pashto or English. Write a detailed response in Pashto. 
+ +### Instruction: +{} + +### Response: +{}""" + +def generate(prompt): + inputs = tokenizer(ALPACA_TEMPLATE.format(prompt, ""), return_tensors="pt").to("cuda") + outputs = model.generate(**inputs, max_new_tokens=500, temperature=0.7, + do_sample=True, pad_token_id=tokenizer.eos_token_id) + return tokenizer.decode(outputs[0], skip_special_tokens=True).split("### Response:")[-1].strip() + +print(generate("Tell me about Peshawar")) +print(generate("پښتونولي تشریح کړه")) +print(generate("پشاور کا مشہور کھانا کیا ہے؟")) +``` + +--- + +## ⚙️ Hardware Requirements + +| Method | VRAM | Speed | +|---|---|---| +| bfloat16 full | 16GB+ | ✅ Fast | +| 4-bit quantized | 8GB+ | ✅ Good | +| Unsloth | 16GB+ | 🔥 2x Faster | +| CPU only | No GPU | ⚠️ Slow | + +--- + +## 📊 Training Details + +### Stage 1 — Continued Pre-Training (CPT) + +| Parameter | Value | +|---|---| +| Base model | Qwen/Qwen2.5-7B | +| Hardware | NVIDIA A100-SXM4-40GB | +| Training steps | 5,000 | +| Final CPT loss | ~1.8 | +| Dataset size | 3,400,000 documents | +| Sequence length | 2,048 tokens | +| Precision | bfloat16 | +| LoRA rank | 64 | +| Learning rate | 5e-5 | +| Effective batch size | 32 | + +### Stage 2 — Supervised Fine-Tuning (SFT) + +| Parameter | Value | +|---|---| +| Base model | junaid008/pashto-qwen2.5-7b-v3 (CPT) | +| Hardware | NVIDIA A100-SXM4-40GB | +| Training steps | 7,908 | +| Final SFT loss | 0.455 | +| Dataset size | 126,519 pairs | +| Epochs | 2 | +| Sequence length | 2,048 tokens | +| Precision | bfloat16 | +| LoRA rank | 64 | +| Learning rate | 5e-5 | +| Effective batch size | 32 | + +--- + +## 📚 Dataset + +### CPT Dataset +- 3.4 million Pakistani Pashto documents +- Sources: news, books, religious texts, Wikipedia, web crawl +- Custom cleaned with Pashto-specific Unicode normalization + +### SFT Dataset — 126,519 pairs: + +| Type | Description | +|---|---| +| Pashto → Pashto | Questions and answers in pure Peshawari Pashto | +| English → Pashto | English instructions with Pashto 
responses | +| Urdu → Pashto | Urdu instructions with Pashto responses | +| Conversation | Natural dialogue in Peshawari dialect | +| Islamic topics | Religious knowledge in Pashto | +| Creative writing | Pashto poetry, stories, descriptions | +| Grammar | Pashto language correction and explanation | +| Translation | English to Pashto translation pairs | +| Cultural | Pashtunwali, traditions, history, geography | + +--- + +## 🗣️ Example Outputs + +**Pashto input:** +``` +Prompt : د پیښور تاریخ راته ووایه +Output : پیښور د نړۍ یو له تر ټولو پخوانیو ښارونو څخه دی چې تاریخ یې زرګونه + کاله پخوا ته رسیږي. د دې ښار پخوانی نوم پوشکلاوتي او بیا پروشاپور و. + د پیښور د بالاحصار کلا د مغلو او سکهانو د وخت یادګار دی... +``` + +**English input:** +``` +Prompt : Tell me about Pashtunwali +Output : پښتونولي د پښتنو د ژوند هغه لرغونی او سپېڅلی ضابطه اخلاق ده چې د دوی + ټولنیز ژوند یې تنظیم کړی دی. په دې کې مېلمه پالنه، غیرت، او د کمزورو + ساتنه تر ټولو لوی ارزښتونه ګڼل کېږي... +``` + +**Urdu input:** +``` +Prompt : پشتونوالی کیا ہے؟ +Output : پښتونولي د پښتنو د ژوند هغه لرغونی او زرین ضابطه اخلاق ده... +``` + +--- + +## 👤 Author + +**Junaid Aslam** +- HuggingFace: [junaid008](https://huggingface.co/junaid008) +- Built independently as a contribution to Pashto NLP + +--- + +## 📜 License + +Apache 2.0 — free to use, modify, and distribute with attribution. 
+ +--- + +## 🤝 Citation +```bibtex +@misc{qehwa-pashto-llm, + author = {Junaid Aslam}, + title = {Qehwa — Pashto's First LLM}, + year = {2026}, + publisher = {HuggingFace}, + url = {https://huggingface.co/junaid008/qehwa-pashto-llm} +} +``` \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..aa782c9 --- /dev/null +++ b/config.json @@ -0,0 +1,64 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "bfloat16", + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 28, + "model_type": "qwen2", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "pad_token_id": 151665, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000.0, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.2.0", + "unsloth_fixed": true, + "unsloth_version": "2026.3.4", + "use_cache": false, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 152064 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..da8a0f7 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "eos_token_id": [ + 151643 + ], + 
"max_length": 131072, + "max_new_tokens": 2048, + "pad_token_id": 151665, + "transformers_version": "5.2.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..8433b46 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b14653034d5866af47bde6859adb272c70eb2475ff742a914bde1cd4287f39e +size 15231272152 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..5340d81 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd5948af71b4f56cf697f7580814c7ce8b80595ef985544efcacf716126a2e31 +size 11422356 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..bdadbb5 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,15 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "left", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +}