From ca8fa4db42a45d1e9043cfe54aae557235ad8f31 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Mon, 11 May 2026 19:31:35 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: jslin09/bloom-560m-finetuned-fraud Source: Original Platform --- .gitattributes | 35 ++++++++++++ README.md | 120 ++++++++++++++++++++++++++++++++++++++++ config.json | 32 +++++++++++ generation_config.json | 7 +++ model.safetensors | 3 + pytorch_model.bin | 3 + special_tokens_map.json | 6 ++ tokenizer.json | 3 + tokenizer_config.json | 12 ++++ training_args.bin | 3 + 10 files changed, 224 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 pytorch_model.bin create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..9465f3d --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx 
filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..08eef15 --- /dev/null +++ b/README.md @@ -0,0 +1,120 @@ +--- +license: bigscience-bloom-rail-1.0 +datasets: +- jslin09/Fraud_Case_Verdicts +language: +- zh +metrics: +- accuracy +pipeline_tag: text-generation +text-generation: + parameters: + max_length: 400 + max_new_tokens: 400 + do_sample: true + temperature: 0.75 + top_k: 50 + top_p: 0.9 +tags: +- legal +widget: +- text: 王大明意圖為自己不法所有,基於竊盜之犯意, + example_title: 生成竊盜罪之犯罪事實 +- text: 騙人布意圖為自己不法所有,基於詐欺取財之犯意, + example_title: 生成詐欺罪之犯罪事實 +- text: 梅友乾明知其無資力支付酒店消費,亦無付款意願,竟意圖為自己不法之所有, + example_title: 生成吃霸王餐之詐欺犯罪事實 +- text: 闕很大明知金融帳戶之存摺、提款卡及密碼係供自己使用之重要理財工具, + example_title: 生成賣帳戶幫助詐欺犯罪事實 +- text: 通訊王明知近來盛行以虛設、租賃、借用或買賣行動電話人頭門號之方式,供詐騙集團作為詐欺他人交付財物等不法用途, + example_title: 生成賣電話SIM卡之幫助詐欺犯罪事實 +- text: 趙甲王基於行使偽造特種文書及詐欺取財之犯意, + example_title: 偽造特種文書(契約、車牌等)詐財 +--- + +# 判決書「犯罪事實」欄草稿自動生成 +本模型是以司法院公開之「詐欺」案件判決書做成之資料集,基於 [BLOOM 560m](https://huggingface.co/bigscience/bloom-560m) 模型進行微調訓練,可以自動生成詐欺及竊盜案件之犯罪事實段落之草稿。資料集之資料範圍從100年1月1日至110年12月31日,所蒐集到的原始資料共有 74823 
篇(判決以及裁定),我們只取判決書的「犯罪事實」欄位內容,並把這原始的資料分成三份,用於訓練的資料集有59858篇,約佔原始資料的80%,剩下的20%,則是各分配10%給驗證集(7482篇),10%給測試集(7483篇)。在本網頁進行測試時,請在模型載入完畢並生成第一小句後,持續按下Compute按鈕,就能持續生成文字。或是輸入自己想要測試的資料到文字框中進行測試。或是可以到[這裡](https://huggingface.co/spaces/jslin09/legal_document_drafting)獲得更完整的使用體驗。 + +# 比較 +以下是本模型與經過微調後的BLOOM 560m、Llama 3.2-1b以 [ROUGE-L](https://en.wikipedia.org/wiki/ROUGE_(metric)) 做評估後的散點圖。 +![ROUGE-L](https://i.imgur.com/V20ER70.png) + +# 使用範例 +如果要在自己的程式中調用本模型,可以參考下列的 Python 程式碼,藉由呼叫 API 的方式來生成刑事判決書「犯罪事實」欄的內容。 +
+ 點擊後展開 +
+  
+import requests, json
+from time import sleep
+from tqdm.auto import tqdm, trange
+
+API_URL = "https://api-inference.huggingface.co/models/jslin09/bloom-560m-finetuned-fraud"  # endpoint that query() posts to
+API_TOKEN = 'XXXXXXXXXXXXXXX' # API token used to call the model; the X's appear to be a placeholder for your own token
+headers = {"Authorization": f"Bearer {API_TOKEN}"}  # bearer-token header sent with every request made by query()
+
+def query(payload, timeout=120):  # POST `payload` as JSON to API_URL and return the decoded JSON reply
+    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)  # raises requests.Timeout instead of hanging when the server stays silent for `timeout` seconds
+    return json.loads(response.content.decode("utf-8"))
+
+prompt = "森上梅前明知其無資力支付酒店消費,亦無付款意願,竟意圖為自己不法之所有,"
+query_dict = {
+    "inputs": prompt,  # replaced after every successful call, so each request continues from the text so far
+}
+text_len = 300  # number of API calls to attempt
+t = trange(text_len, desc='生成例稿', leave=True)
+for i in t:
+    response = query(query_dict)
+    try:
+        response_text = response[0]['generated_text']
+    except (KeyError, IndexError, TypeError):  # reply carries no generated text, e.g. an error payload
+        sleep(30)  # the server may be too busy to answer; wait 30 seconds, then try again
+        continue
+    query_dict["inputs"] = response_text
+    t.set_description(f"{i}: {response_text}")
+    t.refresh()
+print(query_dict["inputs"])  # latest successfully generated text; still valid when the last call failed
+
+
+
+ +或是,你要使用 transformers 套件來實作你的程式,將本模型下載至你本地端的電腦中執行,可以參考下列程式碼: +
+ 點擊後展開 +
+  
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+tokenizer = AutoTokenizer.from_pretrained("jslin09/bloom-560m-finetuned-fraud")  # tokenizer loaded from this model's repo id
+model = AutoModelForCausalLM.from_pretrained("jslin09/bloom-560m-finetuned-fraud")  # causal-LM weights loaded from the same repo id
+
+
+
+ +# 本模型以各項指標進行評估的結果如下 [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) +詳細的結果在 [這裡](https://huggingface.co/datasets/open-llm-leaderboard/details_jslin09__bloom-560m-finetuned-fraud)。 +本模型只使用範圍相當小的資料集進行微調,就任務目標來說,已經是完美解決,但就廣泛的通用性來說,其實是不完美的。總的來說,如果應用場景是需要把模型建置在本地端、不能連到外部網路、提示字資料也不能外送的情境下,本模型的建置過程及結果提供了一個可行性的示範。 + +| Metric | Value | +|-----------------------|---------------------------| +| Avg. | 18.37 | +| ARC (25-shot) | 26.96 | +| HellaSwag (10-shot) | 28.87 | +| MMLU (5-shot) | 24.03 | +| TruthfulQA (0-shot) | 0.0 | +| Winogrande (5-shot) | 48.38 | +| GSM8K (5-shot) | 0.0 | +| DROP (3-shot) | 0.33 | + +# 引文訊息 + +``` +@article{lin2025assisting, + title={Assisting Drafting of Chinese Legal Documents Using Fine-Tuned Pre-trained Large Language Models}, + author={Lin, Chun-Hsien and Cheng, Pu-Jen}, + journal={The Review of Socionetwork Strategies}, + pages={1--28}, + year={2025}, + publisher={Springer} +} +``` \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..e755d15 --- /dev/null +++ b/config.json @@ -0,0 +1,32 @@ +{ + "_name_or_path": "./model/bloom-560m-finetuned-fraud", + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "BloomForCausalLM" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "bias_dropout_fusion": true, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_dropout": 0.0, + "hidden_size": 1024, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "masked_softmax_fusion": true, + "model_type": "bloom", + "n_head": 16, + "n_inner": null, + "n_layer": 24, + "offset_alibi": 100, + "pad_token_id": 3, + "pretraining_tp": 1, + "skip_bias_add": true, + "skip_bias_add_qkv": false, + "slow_but_exact": false, + "torch_dtype": "float32", + "transformers_version": "4.26.1", + "unk_token_id": 0, + "use_cache": true, + "vocab_size": 250880 +} diff --git a/generation_config.json 
b/generation_config.json new file mode 100644 index 0000000..276af43 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 3, + "transformers_version": "4.26.1" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..262e1ae --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08f6913278cd5144bb6be6758ae284f3920744788fb1a9c149815c6e04053dbb +size 2236892304 diff --git a/pytorch_model.bin b/pytorch_model.bin new file mode 100644 index 0000000..8ef51a1 --- /dev/null +++ b/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83b9c005d53bf8f4346e60b5a0b840c803b007a81a99a65e37f2bc08b5e954d4 +size 2236953377 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..fdafe48 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..a4fa803 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f6efc66e73f1fd69da4f436e48befb519fdff3fe18910850c1d41bd862293a5 +size 14500443 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..3ce2a29 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,12 @@ +{ + "add_prefix_space": false, + "bos_token": "", + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "name_or_path": "bigscience/bloom-560m", + "pad_token": "", + "padding_side": "left", + "special_tokens_map_file": null, + "tokenizer_class": "BloomTokenizer", + "unk_token": "" +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..38b6146 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:c1f982afcfb0b01dd5ae59de73ec99d311ef867e63b72bf7c57610827cda77fb +size 3579