629 lines
7.6 KiB
Markdown
629 lines
7.6 KiB
Markdown
|
|
---
|
||
|
|
base_model: MihaiPopa-1/OmniTranslate-1.0
|
||
|
|
# base_model: Unsloth/Qwen3-0.6B-Unsloth-bnb-4bit - Variant that I used for fine-tuning (4-bit BNB quant by Unsloth)
|
||
|
|
tags:
|
||
|
|
- text-generation-inference
|
||
|
|
- translation
|
||
|
|
- transformers
|
||
|
|
- unsloth
|
||
|
|
- qwen3
|
||
|
|
- omnitranslate
|
||
|
|
license: apache-2.0
|
||
|
|
language:
|
||
|
|
- abk
|
||
|
|
- abq
|
||
|
|
- abs
|
||
|
|
- acm
|
||
|
|
- adh
|
||
|
|
- adi
|
||
|
|
- ady
|
||
|
|
- aeb
|
||
|
|
- afr
|
||
|
|
- agx
|
||
|
|
- aii
|
||
|
|
- aim
|
||
|
|
- ain
|
||
|
|
- ajz
|
||
|
|
- akb
|
||
|
|
- aln
|
||
|
|
- als
|
||
|
|
- alt
|
||
|
|
- amh
|
||
|
|
- anp
|
||
|
|
- aoz
|
||
|
|
- apc
|
||
|
|
- apt
|
||
|
|
- arb
|
||
|
|
- arg
|
||
|
|
- arq
|
||
|
|
- ars
|
||
|
|
- ary
|
||
|
|
- arz
|
||
|
|
- asm
|
||
|
|
- ast
|
||
|
|
- atb
|
||
|
|
- ava
|
||
|
|
- awa
|
||
|
|
- ayp
|
||
|
|
- ayr
|
||
|
|
- azb
|
||
|
|
- azj
|
||
|
|
- bak
|
||
|
|
- bam
|
||
|
|
- ban
|
||
|
|
- bar
|
||
|
|
- bas
|
||
|
|
- bbc
|
||
|
|
- bbk
|
||
|
|
- bcl
|
||
|
|
- bdq
|
||
|
|
- bel
|
||
|
|
- ben
|
||
|
|
- bew
|
||
|
|
- bho
|
||
|
|
- bhp
|
||
|
|
- bis
|
||
|
|
- biu
|
||
|
|
- bjn
|
||
|
|
- bod
|
||
|
|
- bos
|
||
|
|
- brh
|
||
|
|
- brx
|
||
|
|
- bts
|
||
|
|
- btx
|
||
|
|
- bug
|
||
|
|
- bul
|
||
|
|
- bwi
|
||
|
|
- bxr
|
||
|
|
- cat
|
||
|
|
- cbk
|
||
|
|
- ccp
|
||
|
|
- ceb
|
||
|
|
- ces
|
||
|
|
- cfm
|
||
|
|
- cha
|
||
|
|
- che
|
||
|
|
- chr
|
||
|
|
- chu
|
||
|
|
- chv
|
||
|
|
- cjs
|
||
|
|
- ckb
|
||
|
|
- ckt
|
||
|
|
- cmn
|
||
|
|
- cnh
|
||
|
|
- cnw
|
||
|
|
- cos
|
||
|
|
- crh
|
||
|
|
- crj
|
||
|
|
- crk
|
||
|
|
- crl
|
||
|
|
- crs
|
||
|
|
- csb
|
||
|
|
- csw
|
||
|
|
- csy
|
||
|
|
- ctd
|
||
|
|
- cym
|
||
|
|
- czt
|
||
|
|
- dak
|
||
|
|
- dan
|
||
|
|
- dar
|
||
|
|
- deu
|
||
|
|
- dik
|
||
|
|
- diu
|
||
|
|
- div
|
||
|
|
- dje
|
||
|
|
- dks
|
||
|
|
- dln
|
||
|
|
- dng
|
||
|
|
- dnw
|
||
|
|
- doi
|
||
|
|
- dru
|
||
|
|
- dsb
|
||
|
|
- dtp
|
||
|
|
- dty
|
||
|
|
- dzo
|
||
|
|
- ekk
|
||
|
|
- ell
|
||
|
|
- emj
|
||
|
|
- enl
|
||
|
|
- enm
|
||
|
|
- epo
|
||
|
|
- ess
|
||
|
|
- eus
|
||
|
|
- eve
|
||
|
|
- ewo
|
||
|
|
- ext
|
||
|
|
- fao
|
||
|
|
- fas
|
||
|
|
- ffm
|
||
|
|
- fij
|
||
|
|
- fil
|
||
|
|
- fin
|
||
|
|
- fit
|
||
|
|
- fkv
|
||
|
|
- fmu
|
||
|
|
- fra
|
||
|
|
- fro
|
||
|
|
- frp
|
||
|
|
- fry
|
||
|
|
- fuf
|
||
|
|
- fur
|
||
|
|
- fuv
|
||
|
|
- gag
|
||
|
|
- gaz
|
||
|
|
- gcf
|
||
|
|
- gla
|
||
|
|
- gle
|
||
|
|
- glg
|
||
|
|
- glk
|
||
|
|
- glv
|
||
|
|
- gmh
|
||
|
|
- gnb
|
||
|
|
- goh
|
||
|
|
- gom
|
||
|
|
- gos
|
||
|
|
- grc
|
||
|
|
- gsw
|
||
|
|
- gug
|
||
|
|
- guj
|
||
|
|
- guz
|
||
|
|
- hac
|
||
|
|
- hae
|
||
|
|
- hak
|
||
|
|
- hat
|
||
|
|
- hau
|
||
|
|
- haw
|
||
|
|
- hbo
|
||
|
|
- heb
|
||
|
|
- her
|
||
|
|
- hif
|
||
|
|
- hil
|
||
|
|
- hin
|
||
|
|
- hmr
|
||
|
|
- hne
|
||
|
|
- hns
|
||
|
|
- hrv
|
||
|
|
- hrx
|
||
|
|
- hsb
|
||
|
|
- hun
|
||
|
|
- hwc
|
||
|
|
- hye
|
||
|
|
- hyw
|
||
|
|
- iba
|
||
|
|
- ibg
|
||
|
|
- ibo
|
||
|
|
- ife
|
||
|
|
- ike
|
||
|
|
- ikt
|
||
|
|
- ilo
|
||
|
|
- ina
|
||
|
|
- ind
|
||
|
|
- inh
|
||
|
|
- isl
|
||
|
|
- ita
|
||
|
|
- ivv
|
||
|
|
- jav
|
||
|
|
- jpn
|
||
|
|
- jun
|
||
|
|
- kaa
|
||
|
|
- kab
|
||
|
|
- kac
|
||
|
|
- kak
|
||
|
|
- kal
|
||
|
|
- kam
|
||
|
|
- kan
|
||
|
|
- kas
|
||
|
|
- kat
|
||
|
|
- kaz
|
||
|
|
- kbd
|
||
|
|
- kca
|
||
|
|
- kdh
|
||
|
|
- kdr
|
||
|
|
- kea
|
||
|
|
- kei
|
||
|
|
- kgp
|
||
|
|
- kha
|
||
|
|
- khk
|
||
|
|
- khm
|
||
|
|
- kik
|
||
|
|
- kin
|
||
|
|
- kir
|
||
|
|
- kiu
|
||
|
|
- kjb
|
||
|
|
- kjh
|
||
|
|
- kmr
|
||
|
|
- knc
|
||
|
|
- koi
|
||
|
|
- kor
|
||
|
|
- kos
|
||
|
|
- kpv
|
||
|
|
- krj
|
||
|
|
- krl
|
||
|
|
- kru
|
||
|
|
- ksh
|
||
|
|
- ksw
|
||
|
|
- ktj
|
||
|
|
- ktz
|
||
|
|
- kua
|
||
|
|
- kum
|
||
|
|
- kwn
|
||
|
|
- kyu
|
||
|
|
- kzj
|
||
|
|
- lad
|
||
|
|
- lao
|
||
|
|
- lat
|
||
|
|
- lbe
|
||
|
|
- ldn
|
||
|
|
- lew
|
||
|
|
- lez
|
||
|
|
- lfn
|
||
|
|
- lim
|
||
|
|
- lin
|
||
|
|
- lis
|
||
|
|
- lit
|
||
|
|
- lki
|
||
|
|
- lld
|
||
|
|
- lmk
|
||
|
|
- lnd
|
||
|
|
- lrc
|
||
|
|
- ltg
|
||
|
|
- ltz
|
||
|
|
- lud
|
||
|
|
- lug
|
||
|
|
- luo
|
||
|
|
- lus
|
||
|
|
- lvs
|
||
|
|
- lwg
|
||
|
|
- lzh
|
||
|
|
- mag
|
||
|
|
- mah
|
||
|
|
- mai
|
||
|
|
- mak
|
||
|
|
- mal
|
||
|
|
- mar
|
||
|
|
- mas
|
||
|
|
- mbf
|
||
|
|
- mdf
|
||
|
|
- mer
|
||
|
|
- mfe
|
||
|
|
- mfg
|
||
|
|
- mfy
|
||
|
|
- mhi
|
||
|
|
- mhr
|
||
|
|
- mhy
|
||
|
|
- min
|
||
|
|
- mip
|
||
|
|
- mjw
|
||
|
|
- mkd
|
||
|
|
- mlt
|
||
|
|
- mni
|
||
|
|
- mnk
|
||
|
|
- mns
|
||
|
|
- mnw
|
||
|
|
- moh
|
||
|
|
- mph
|
||
|
|
- mqy
|
||
|
|
- mri
|
||
|
|
- mrj
|
||
|
|
- mrw
|
||
|
|
- mtg
|
||
|
|
- mui
|
||
|
|
- mup
|
||
|
|
- mus
|
||
|
|
- mvp
|
||
|
|
- mwf
|
||
|
|
- mwl
|
||
|
|
- mww
|
||
|
|
- mya
|
||
|
|
- myv
|
||
|
|
- myx
|
||
|
|
- mzh
|
||
|
|
- nah
|
||
|
|
- nan
|
||
|
|
- nap
|
||
|
|
- naq
|
||
|
|
- nbu
|
||
|
|
- nde
|
||
|
|
- ndo
|
||
|
|
- nds
|
||
|
|
- new
|
||
|
|
- nio
|
||
|
|
- njn
|
||
|
|
- njo
|
||
|
|
- nld
|
||
|
|
- nmf
|
||
|
|
- nmz
|
||
|
|
- nno
|
||
|
|
- nob
|
||
|
|
- nog
|
||
|
|
- non
|
||
|
|
- npi
|
||
|
|
- npo
|
||
|
|
- nrf
|
||
|
|
- nri
|
||
|
|
- nrm
|
||
|
|
- nse
|
||
|
|
- nus
|
||
|
|
- nya
|
||
|
|
- nyn
|
||
|
|
- nzm
|
||
|
|
- obo
|
||
|
|
- oci
|
||
|
|
- ojb
|
||
|
|
- olo
|
||
|
|
- orv
|
||
|
|
- ory
|
||
|
|
- oss
|
||
|
|
- ota
|
||
|
|
- oto
|
||
|
|
- otw
|
||
|
|
- pam
|
||
|
|
- pan
|
||
|
|
- pap
|
||
|
|
- pbt
|
||
|
|
- pcd
|
||
|
|
- pck
|
||
|
|
- pcm
|
||
|
|
- pfl
|
||
|
|
- plt
|
||
|
|
- pmq
|
||
|
|
- pmx
|
||
|
|
- pnb
|
||
|
|
- pnt
|
||
|
|
- pol
|
||
|
|
- por
|
||
|
|
- pov
|
||
|
|
- ppk
|
||
|
|
- pps
|
||
|
|
- prg
|
||
|
|
- pui
|
||
|
|
- pxm
|
||
|
|
- quc
|
||
|
|
- qul
|
||
|
|
- qup
|
||
|
|
- qus
|
||
|
|
- quz
|
||
|
|
- raw
|
||
|
|
- rcf
|
||
|
|
- rel
|
||
|
|
- rhg
|
||
|
|
- ria
|
||
|
|
- rjs
|
||
|
|
- rmc
|
||
|
|
- rml
|
||
|
|
- rmn
|
||
|
|
- rmy
|
||
|
|
- rnl
|
||
|
|
- roh
|
||
|
|
- ron
|
||
|
|
- rtm
|
||
|
|
- rue
|
||
|
|
- run
|
||
|
|
- rus
|
||
|
|
- sah
|
||
|
|
- san
|
||
|
|
- sat
|
||
|
|
- sck
|
||
|
|
- scn
|
||
|
|
- sda
|
||
|
|
- sdc
|
||
|
|
- sdh
|
||
|
|
- ses
|
||
|
|
- sgc
|
||
|
|
- sgh
|
||
|
|
- sid
|
||
|
|
- sin
|
||
|
|
- sju
|
||
|
|
- skr
|
||
|
|
- slk
|
||
|
|
- slv
|
||
|
|
- sma
|
||
|
|
- sme
|
||
|
|
- smj
|
||
|
|
- smn
|
||
|
|
- smo
|
||
|
|
- sms
|
||
|
|
- smt
|
||
|
|
- sna
|
||
|
|
- snd
|
||
|
|
- som
|
||
|
|
- sot
|
||
|
|
- spa
|
||
|
|
- srd
|
||
|
|
- srp
|
||
|
|
- ssw
|
||
|
|
- sul
|
||
|
|
- sun
|
||
|
|
- swe
|
||
|
|
- swg
|
||
|
|
- swh
|
||
|
|
- syc
|
||
|
|
- syl
|
||
|
|
- szl
|
||
|
|
- tab
|
||
|
|
- tam
|
||
|
|
- taq
|
||
|
|
- tat
|
||
|
|
- tcy
|
||
|
|
- tcz
|
||
|
|
- tel
|
||
|
|
- tet
|
||
|
|
- tgk
|
||
|
|
- tha
|
||
|
|
- thl
|
||
|
|
- tig
|
||
|
|
- tir
|
||
|
|
- tkl
|
||
|
|
- tkr
|
||
|
|
- tlh
|
||
|
|
- tly
|
||
|
|
- tok
|
||
|
|
- ton
|
||
|
|
- tpi
|
||
|
|
- tpw
|
||
|
|
- trc
|
||
|
|
- trp
|
||
|
|
- trs
|
||
|
|
- ttj
|
||
|
|
- tuk
|
||
|
|
- tur
|
||
|
|
- tuv
|
||
|
|
- twx
|
||
|
|
- tyv
|
||
|
|
- tzl
|
||
|
|
- tzm
|
||
|
|
- udm
|
||
|
|
- uig
|
||
|
|
- ukr
|
||
|
|
- urd
|
||
|
|
- uzn
|
||
|
|
- uzs
|
||
|
|
- vap
|
||
|
|
- vie
|
||
|
|
- vot
|
||
|
|
- vro
|
||
|
|
- war
|
||
|
|
- way
|
||
|
|
- wba
|
||
|
|
- wbm
|
||
|
|
- wes
|
||
|
|
- whk
|
||
|
|
- wlx
|
||
|
|
- wol
|
||
|
|
- wsg
|
||
|
|
- wwa
|
||
|
|
- xal
|
||
|
|
- xho
|
||
|
|
- xmm
|
||
|
|
- xmv
|
||
|
|
- xog
|
||
|
|
- yaz
|
||
|
|
- ydd
|
||
|
|
- yor
|
||
|
|
- yrk
|
||
|
|
- yrl
|
||
|
|
- yua
|
||
|
|
- yue
|
||
|
|
- zea
|
||
|
|
- zgh
|
||
|
|
- zom
|
||
|
|
- zsm
|
||
|
|
- zul
|
||
|
|
pipeline_tag: translation
|
||
|
|
datasets:
|
||
|
|
- MihaiPopa-1/OmniSurgical-1.1
|
||
|
|
---
|
||
|
|
|
||
|
|
# OmniTranslate 1.1
|
||
|
|
|
||
|
|
OmniTranslate 1.1 is a massively multilingual machine translation model supporting over 500 languages. Fine-tuned from [Qwen 3 0.6B](https://www.huggingface.co/Qwen/Qwen3-0.6B) (with Unsloth), this model is designed for translation tasks on any device!
|
||
|
|
|
||
|
|
# Features
|
||
|
|
* **500+ Languages Supported:** The broadest coverage of languages supported for a translation model that's under 1 billion parameters!
|
||
|
|
* **Tiny Size:** Beats any other large model on speed and memory usage. No other model is able to compete with this!
|
||
|
|
|
||
|
|
# Improvements over 1.0
|
||
|
|
* OmniTranslate now makes fewer hiccups when translating to Romanian (like "ami"), and the diacritic bug in Romanian translations has been mostly fixed!
|
||
|
|
|
||
|
|
There's still a tiny chance that the model will produce output without diacritics (mostly depending on the random seed), so if that happens, try a different seed.
|
||
|
|
|
||
|
|
# Experimental Features
|
||
|
|
* We added 2 new languages: Emoji and Sulfuristic Speak (my own language, created for OmniTranslate 1.1 to fit the Chaos Cubed Minecraft vibe!). Try these out:
|
||
|
|
|
||
|
|
## Emoji
|
||
|
|
```python
|
||
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||
|
|
import torch
|
||
|
|
|
||
|
|
# 1. Load from your Hugging Face Repo
|
||
|
|
model_id = "MihaiPopa-1/OmniTranslate-1.1"
|
||
|
|
|
||
|
|
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||
|
|
model = AutoModelForCausalLM.from_pretrained(
|
||
|
|
model_id,
|
||
|
|
torch_dtype=torch.float32, # Standard for CPU
|
||
|
|
device_map="cpu" # Forces CPU usage
|
||
|
|
)
|
||
|
|
|
||
|
|
# 2. Translate to Emoji
|
||
|
|
prompt = "<|im_start|>user\nTranslate to emj_Emoj: We love the world!<|im_end|>\n<|im_start|>assistant\n<think>\n"
|
||
|
|
inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
|
||
|
|
|
||
|
|
with torch.no_grad():
|
||
|
|
outputs = model.generate(**inputs, max_new_tokens=64, temperature=0.1)
|
||
|
|
|
||
|
|
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||
|
|
```
|
||
|
|
|
||
|
|
## Sulfuristic Speak
|
||
|
|
```python
|
||
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||
|
|
import torch
|
||
|
|
|
||
|
|
# 1. Load from your Hugging Face Repo
|
||
|
|
model_id = "MihaiPopa-1/OmniTranslate-1.1"
|
||
|
|
|
||
|
|
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||
|
|
model = AutoModelForCausalLM.from_pretrained(
|
||
|
|
model_id,
|
||
|
|
torch_dtype=torch.float32, # Standard for CPU
|
||
|
|
device_map="cpu" # Forces CPU usage
|
||
|
|
)
|
||
|
|
|
||
|
|
# 2. Translate to Sulfuristic Speak ("Translate to Sulfuristic Speak" also works!)
|
||
|
|
prompt = "<|im_start|>user\nTranslate to sul_Latn: Let's ride a Sulfur Cube!<|im_end|>\n<|im_start|>assistant\n<think>\n"
|
||
|
|
inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
|
||
|
|
|
||
|
|
with torch.no_grad():
|
||
|
|
outputs = model.generate(**inputs, max_new_tokens=128, temperature=0.1)
|
||
|
|
|
||
|
|
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||
|
|
```
|
||
|
|
|
||
|
|
# Notes
|
||
|
|
OmniTranslate 1.1 is still an experimental model and shouldn't be used for tasks where accurate translations matter.
|
||
|
|
|
||
|
|
# Tips
|
||
|
|
Providing the ISO code instead of the language name can improve the results a lot.
|
||
|
|
|
||
|
|
# Usage
|
||
|
|
Code is by Gemini 3 Flash (with a few small modifications by me):
|
||
|
|
```python
|
||
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||
|
|
import torch
|
||
|
|
|
||
|
|
# 1. Load from your Hugging Face Repo
|
||
|
|
model_id = "MihaiPopa-1/OmniTranslate-1.1"
|
||
|
|
|
||
|
|
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||
|
|
model = AutoModelForCausalLM.from_pretrained(
|
||
|
|
model_id,
|
||
|
|
torch_dtype=torch.float32, # Standard for CPU
|
||
|
|
device_map="cpu" # Forces CPU usage
|
||
|
|
)
|
||
|
|
|
||
|
|
# 2. Translate (replace ron_Latn with your language here)
|
||
|
|
prompt = "<|im_start|>user\nTranslate to ron_Latn: OmniTranslate is a massively multilingual machine translation model supporting over 500 languages!<|im_end|>\n<|im_start|>assistant\n<think>\n"
|
||
|
|
inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
|
||
|
|
|
||
|
|
with torch.no_grad():
|
||
|
|
outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.1)
|
||
|
|
|
||
|
|
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||
|
|
```
|
||
|
|
|
||
|
|
# Data Used
|
||
|
|
I used my own [OmniSurgical 1.1](https://www.huggingface.co/datasets/MihaiPopa-1/OmniSurgical-1.1) dataset, which itself contains a part of [HF's FineTranslations](https://www.huggingface.co/datasets/HuggingFaceFW/finetranslations).
|
||
|
|
|
||
|
|
---
|
||
|
|
|
||
|
|
# Uploaded finetuned model
|
||
|
|
|
||
|
|
- **Developed by:** MihaiPopa-1
|
||
|
|
- **License:** apache-2.0
|
||
|
|
- **Finetuned from model :** unsloth/qwen3-0.6b-unsloth-bnb-4bit
|
||
|
|
|
||
|
|
This qwen3 model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.
|
||
|
|
|
||
|
|
[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
|