From 53332e0057456e8083cc151c9192dd3adbb53add Mon Sep 17 00:00:00 2001
From: ModelHub XC
Date: Wed, 6 May 2026 14:57:49 +0800
Subject: [PATCH] Initialize project; model provided by the ModelHub XC
 community
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Model: line-corporation/japanese-large-lm-3.6b
Source: Original Platform
---
 .gitattributes        | 35 +++++++++++++++++++++++
 README.md             | 66 +++++++++++++++++++++++++++++++++++++++++++
 config.json           | 25 ++++++++++++++++
 model.safetensors     |  3 ++
 pytorch_model.bin     |  3 ++
 spiece.model          |  3 ++
 tokenizer_config.json | 15 ++++++++++
 7 files changed, 150 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 README.md
 create mode 100644 config.json
 create mode 100644 model.safetensors
 create mode 100644 pytorch_model.bin
 create mode 100644 spiece.model
 create mode 100644 tokenizer_config.json

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..a6344aa
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a4de5c4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,66 @@
+---
+license: apache-2.0
+datasets:
+- wikipedia
+- mc4
+- cc100
+- oscar
+language:
+- ja
+---
+# japanese-large-lm-3.6b
+
+This repository provides a 3.6B-parameter Japanese language model trained by [LINE Corporation](https://linecorp.com/ja/).
+
+Our [Tech Blog](https://engineering.linecorp.com/ja/blog/3.6-billion-parameter-japanese-language-model) explains the details.

## How to use

```
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, set_seed

model = AutoModelForCausalLM.from_pretrained("line-corporation/japanese-large-lm-3.6b", torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained("line-corporation/japanese-large-lm-3.6b", use_fast=False)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
set_seed(101)

text = generator(
    "おはようございます、今日の天気は",
    max_length=30,
    do_sample=True,
    pad_token_id=tokenizer.pad_token_id,
    num_return_sequences=5,
)

for t in text:
    print(t)

# Example of generated output:
# [{'generated_text': 'おはようございます、今日の天気は雨模様ですね。梅雨のこの時期の 朝は洗濯物が乾きにくいなど、主婦にとっては悩みどころですね。 では、'},
# {'generated_text': 'おはようございます、今日の天気は晴れ。 気温は8°C位です。 朝晩は結構冷え込むようになりました。 寒くなってくると、...'},
# {'generated_text': 'おはようございます、今日の天気は曇りです。 朝起きたら雪が軽く積もっていた。 寒さもそれほどでもありません。 日中は晴れるみたいですね。'},
# {'generated_text': 'おはようございます、今日の天気は☁のち☀です。 朝の気温5°C、日中も21°Cと 暖かい予報です'},
# {'generated_text': 'おはようございます、今日の天気は晴天ですが涼しい1日です、気温は午後になり低くなり25°Cくらい、風も強いようですので、'}]
```

## Model architecture
| Model | Vocab size | Architecture | Position type | Layers | Hidden dim | Attention heads |
| :---: | :--------: | :----------: | :-----------: | :----: | :--------: | :-------------: |
| 1.7B | 51200 | GPT2 | Absolute | 24 | 2304 | 24 |
| 3.6B | 51200 | GPTNeoX | RoPE | 30 | 3072 | 32 |

## Training Corpus
Our training corpus consists of the Japanese portions of publicly available corpora such as C4, CC-100, and OSCAR.
We also incorporated web texts crawled by our in-house system.
The total size of our training corpus is about 650 GB.
The trained model achieves a perplexity of 7.50 on our internal validation set of Japanese C4.

## Tokenization
We use a SentencePiece tokenizer with a unigram language model and byte fallback.
We **do not** apply pre-tokenization with a Japanese tokenizer.
Thus, users can feed raw sentences directly into the tokenizer.
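
Because no pre-tokenization is applied, raw text can be passed straight to the tokenizer. The snippet below is a minimal sketch of that workflow; the example sentence is our own, and the exact pieces it prints depend on the shipped `spiece.model`.

```
from transformers import AutoTokenizer

# The slow (SentencePiece-backed) tokenizer is required, matching the
# use_fast=False setting in the usage example above.
tokenizer = AutoTokenizer.from_pretrained(
    "line-corporation/japanese-large-lm-3.6b", use_fast=False
)

# A raw Japanese sentence goes in directly; no morphological analyzer
# (e.g. MeCab) is run beforehand.
text = "今日はいい天気ですね。"

print(tokenizer.tokenize(text))  # unigram-LM subword pieces
print(tokenizer.encode(text))    # corresponding ids from the 51,200-entry vocab
```

Characters not covered by the vocabulary decompose into byte pieces rather than collapsing to a single unknown token; that is what the byte-fallback setting refers to.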

## License
[Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
\ No newline at end of file
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..60effa1
--- /dev/null
+++ b/config.json
@@ -0,0 +1,25 @@
+{
+  "architectures": [
+    "GPTNeoXForCausalLM"
+  ],
+  "bos_token_id": 2,
+  "classifier_dropout": 0.1,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_size": 3072,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 2048,
+  "model_type": "gpt_neox",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 30,
+  "rotary_emb_base": 10000,
+  "rotary_pct": 1.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "float16",
+  "transformers_version": "4.29.2",
+  "use_cache": true,
+  "use_parallel_residual": false,
+  "vocab_size": 51200
+}
diff --git a/model.safetensors b/model.safetensors
new file mode 100644
index 0000000..8d7b2b3
--- /dev/null
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90f2fb996c52955727b13b4f3a84a7c3cca9c525017051a9efebc0ae5575e72e
+size 7237638964
diff --git a/pytorch_model.bin b/pytorch_model.bin
new file mode 100644
index 0000000..4d8029f
--- /dev/null
+++ b/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:baa5571b7827fa31387b9d98877ccee8f5e9859633c7f4136f6c37ab4c4c41a1
+size 7237734117
diff --git a/spiece.model b/spiece.model
new file mode 100644
index 0000000..0075e65
--- /dev/null
+++ b/spiece.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c5c56a739832923347681ed8a03a9cbf5afb6d1fe60089a5b01dd2dd063ab71
+size 1208648
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000..1c9d583
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,15 @@
+{
+  "extra_ids": 0,
+  "do_lower_case": false,
+  "keep_accents": true,
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "unk_token": "<unk>",
+  "pad_token": "<pad>",
+  "mask_token": "<mask>",
+  "cls_token": "<cls>",
+  "sep_token": "<sep>",
+  "sp_model_kwargs": {},
+  "special_tokens_map_file": null,
+  "tokenizer_class": "T5Tokenizer"
+}
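
A closing note on the files above: config.json pins down the GPT-NeoX geometry listed in the README's architecture table. The sketch below is our own illustration, not part of the patch; it assumes the repository has been cloned to a local directory named `japanese-large-lm-3.6b` and loads only the configuration, without touching the LFS weight files.

```
from transformers import AutoConfig

# Load only config.json; no weights are downloaded or read.
# "./japanese-large-lm-3.6b" is an assumed local clone of this repository.
config = AutoConfig.from_pretrained("./japanese-large-lm-3.6b")

# Cross-check against the architecture table in README.md.
assert config.model_type == "gpt_neox"
assert config.num_hidden_layers == 30    # Layers
assert config.hidden_size == 3072        # Hidden dim
assert config.num_attention_heads == 32  # Attention heads
assert config.vocab_size == 51200        # Vocab size

# With rotary_pct = 1.0, RoPE covers the full per-head dimension:
print(config.hidden_size // config.num_attention_heads)  # 96
```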