From 469de2c9e5e0b1c76c656a833233673fe06a208a Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Sun, 12 Apr 2026 14:25:57 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: Data-Juicer/LLaMA2-7B-ZH-Chat-52k Source: Original Platform --- .gitattributes | 38 ++++++++++++++++++++ README.md | 59 ++++++++++++++++++++++++++++++++ config.json | 28 +++++++++++++++ generation_config.json | 9 +++++ pytorch_model-00001-of-00003.bin | 3 ++ pytorch_model-00002-of-00003.bin | 3 ++ pytorch_model-00003-of-00003.bin | 3 ++ pytorch_model.bin.index.json | 3 ++ special_tokens_map.json | 6 ++++ tokenizer.model | 3 ++ tokenizer_config.json | 35 +++++++++++++++++++ 11 files changed, 190 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 pytorch_model-00001-of-00003.bin create mode 100644 pytorch_model-00002-of-00003.bin create mode 100644 pytorch_model-00003-of-00003.bin create mode 100644 pytorch_model.bin.index.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.model create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..4d70de8 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,38 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs 
-text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +pytorch_model-00001-of-00003.bin filter=lfs diff=lfs merge=lfs -text +pytorch_model-00002-of-00003.bin filter=lfs diff=lfs merge=lfs -text +pytorch_model-00003-of-00003.bin filter=lfs diff=lfs merge=lfs -text +tokenizer.model filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..4c38fb6 --- /dev/null +++ b/README.md @@ -0,0 +1,59 @@ +--- +frameworks: +- Pytorch +license: Apache License 2.0 +tasks: +- text-generation +datasets: + train: + - Data-Juicer/alpaca-cot-zh-refined-by-data-juicer +tags: +- data-juicer +- arxiv:2309.02033 +--- +## News + Our first data-centric LLM competition begins! Please visit the competition's official websites, **FT-Data Ranker** ([1B Track](https://tianchi.aliyun.com/competition/entrance/532157), [7B Track](https://tianchi.aliyun.com/competition/entrance/532158)), for more information. 
+## Introduction +This is a reference LLM from [Data-Juicer](https://github.com/alibaba/data-juicer). + +The model architecture is LLaMA2-7B and we built it upon a pre-trained Chinese checkpoint from [FlagAlpha](https://huggingface.co/FlagAlpha/Atom-7B). +The model is fine-tuned on 52k Chinese chat samples of Data-Juicer's refined [alpaca-CoT data](https://github.com/alibaba/data-juicer/blob/main/configs/data_juicer_recipes/alpaca_cot/README.md#refined-alpaca-cot-dataset-meta-info). +It beats LLaMA2-7B fine-tuned on 543k Belle samples in GPT-4 evaluation. + +For more details, please refer to our [paper](https://arxiv.org/abs/2309.02033). + +![exp_llama](https://img.alicdn.com/imgextra/i2/O1CN019WtUPP1uhebnDlPR8_!!6000000006069-2-tps-2530-1005.png) + +## 使用 +```python + +from modelscope import ( + AutoModelForCausalLM, AutoTokenizer, GenerationConfig, snapshot_download +) +model_dir = 'LLaMA2-7B-ZH-Chat-52k' + +tokenizer = AutoTokenizer.from_pretrained(model_dir) +model = AutoModelForCausalLM.from_pretrained(model_dir).eval() + +inputs = tokenizer('How are you?', return_tensors='pt').to(model.device) +response = model.generate(inputs.input_ids, max_length=128) +print(tokenizer.decode(response.cpu()[0], skip_special_tokens=True)) +``` + +## 参考 +If you find our work useful for your research or development, please kindly cite the following [paper](https://arxiv.org/abs/2309.02033). 
+``` +@misc{chen2023datajuicer, +title={Data-Juicer: A One-Stop Data Processing System for Large Language Models}, +author={Daoyuan Chen and Yilun Huang and Zhijian Ma and Hesen Chen and Xuchen Pan and Ce Ge and Dawei Gao and Yuexiang Xie and Zhaoyang Liu and Jinyang Gao and Yaliang Li and Bolin Ding and Jingren Zhou}, +year={2023}, +eprint={2309.02033}, +archivePrefix={arXiv}, +primaryClass={cs.LG} +} +``` + +#### Clone with HTTP +```bash + git clone https://www.modelscope.cn/Data-Juicer/LLaMA2-7B-ZH-Chat-52k.git +``` diff --git a/config.json b/config.json new file mode 100644 index 0000000..907f18a --- /dev/null +++ b/config.json @@ -0,0 +1,28 @@ +{ + "_name_or_path": "flagalpha-7b_zh_sft_verb_52000", + "architectures": [ + "LlamaForCausalLM" + ], + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_length": 4096, + "max_position_embeddings": 4096, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "pad_token_id": 2, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.31.0", + "unk_token_id": 0, + "use_cache": true, + "vocab_size": 65000 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..735afba --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2, + "temperature": 0.3, + "top_p": 0.95, + "transformers_version": "4.31.0" +} diff --git a/pytorch_model-00001-of-00003.bin b/pytorch_model-00001-of-00003.bin new file mode 100644 index 0000000..1507e14 --- /dev/null +++ b/pytorch_model-00001-of-00003.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9282ea25e3c35f5ca7d6947da368987827cb8169e6e840f7caebf446a956f3b5 +size 9969868716 
diff --git a/pytorch_model-00002-of-00003.bin b/pytorch_model-00002-of-00003.bin new file mode 100644 index 0000000..4dcf043 --- /dev/null +++ b/pytorch_model-00002-of-00003.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c6bdc26d36880b3c67d03702af2d2e7ccf5fb0eb0b24a8c9d4e3fa695920da9 +size 9982883010 diff --git a/pytorch_model-00003-of-00003.bin b/pytorch_model-00003-of-00003.bin new file mode 100644 index 0000000..6c6b027 --- /dev/null +++ b/pytorch_model-00003-of-00003.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98e9058cdf90c2d03ccde6cba48c42c5638712566710be5a2ece743eab7de19e +size 8082373577 diff --git a/pytorch_model.bin.index.json b/pytorch_model.bin.index.json new file mode 100644 index 0000000..8d2d276 --- /dev/null +++ b/pytorch_model.bin.index.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5657461d1c65b228e3d1b181993f28ab6824e26a093f9a280e1d5508bdf048f +size 26788 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..9bfed75 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "</s>", + "unk_token": "<unk>" +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000..cd225e0 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04ef61cc08360cd193f9056cb10e26525451fd62759ca714840663257e7bcdd8 +size 1011042 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..09e076a --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,35 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "bos_token": { + "__type": "AddedToken", + "content": "<s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "</s>", + "lstrip": false, + 
"normalized": true, + "rstrip": false, + "single_word": false + }, + "legacy": true, + "model_max_length": 1024, + "pad_token": null, + "padding_side": "right", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "<unk>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +}