From 574f511177f01c6b0f44ec748dfc39eda6ad8f05 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Wed, 17 Jun 2026 21:53:23 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: KBlueLeaf/TIPO-500M-ft Source: Original Platform --- .gitattributes | 36 +++++++++ README.md | 102 +++++++++++++++++++++++++ TIPO-500M-ft-F16.gguf | 3 + added_tokens.json | 15 ++++ config.json | 31 ++++++++ generation_config.json | 8 ++ model.safetensors | 3 + special_tokens_map.json | 45 +++++++++++ tokenizer.model | 3 + tokenizer_config.json | 163 ++++++++++++++++++++++++++++++++++++++++ 10 files changed, 409 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 TIPO-500M-ft-F16.gguf create mode 100644 added_tokens.json create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 special_tokens_map.json create mode 100644 tokenizer.model create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..cc2e6b4 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs 
merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +TIPO-500M-ft-F16.gguf filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..04bc86e --- /dev/null +++ b/README.md @@ -0,0 +1,102 @@ +--- +license: other +license_name: kohaku-license-1.0 +datasets: +- laion/conceptual-captions-12m-webdataset +- CaptionEmporium/coyo-hd-11m-llavanext +- KBlueLeaf/danbooru2023-metadata-database +- graph-based-captions/GBC10M +language: +- en +pipeline_tag: text-generation +library_name: transformers +--- +# TIPO: Text to Image with text presampling for Prompt Optimization + +500M LLaMA arch model trained for TIPO.
+Tech Report: https://arxiv.org/abs/2411.08127 + +![image/png](https://cdn-uploads.huggingface.co/production/uploads/630593e2fca1d8d92b81d2a1/fc9ovmARapQmgq9DZ7ApJ.png) + +## Introduction + +In this project, we introduce "TIPO" (**T**ext to **I**mage with text presampling for **P**rompt **O**ptimization), an innovative framework designed to significantly enhance the quality and usability of Text-to-Image (T2I) generative models. TIPO utilizes Large Language Models (LLMs) to perform "Text Presampling" within the inference pipeline of text-to-image generative modeling. By refining and extending user input prompts, TIPO enables generative models to produce superior results with minimal user effort, making T2I systems more accessible and effective for a wider range of users. + +## Usage + +Use the updated version of the DTG extension (renamed to z-tipo-extension). The current version of z-tipo-extension supports stable-diffusion-webui, stable-diffusion-webui-forge and ComfyUI. SD-Next hasn't been tested. +https://github.com/KohakuBlueleaf/z-tipo-extension + +## Model arch and Training + +This model is LLaMA arch with 500M parameters; the training data is a combined version of Danbooru2023, Coyo-HD-11M.
+The total number of tokens seen is around 50B.
+For more information please refer to the tech report and following table. + +| | TIPO-200M | TIPO-500M-ft | TIPO-500M | +| ----------------- | ------------------------------------------------------------------------------ | ---------------------------------- | ------------------------------------------------------------------------------ | +| Arch | LLaMA | LLaMA | LLaMA | +| Max ctx length | 1024 | 1024 | 1024 | +| Batch Size | 2048 | 3584 | 3584 | +| Training dataset | Danbooru, GBC10M, 5epoch
Danbooru, GBC10M, Coyo11M, 3epoch | Danbooru(pixtral), GBC10M, Coyo11M, 2epoch | Danbooru, GBC10M, Coyo11M, 5epoch | +| Real Token Seen* | 40B token | 42B (12B more from TIPO-500M) | 30B token | +| Training Hardware | RTX 3090 x 4 | RTX 3090 x 4 | H100 x 8 | +| Training Time | 420 hour` | 290 hour` | 100 hour` | +| Huggingface | [KBlueLeaf/TIPO-200M · Hugging Face](https://huggingface.co/KBlueLeaf/TIPO-200M) | You Are HERE | [KBlueLeaf/TIPO-500M · Hugging Face](https://huggingface.co/KBlueLeaf/TIPO-500M) | + +*: We only count "non-padding token" in the token seen, since all the training data have very large length range.
+`: Since the training samples are fairly short, it takes more time to reach the same number of tokens seen than in general LLM pretraining.
+As a reference, with a max ctx length of 4096 and almost all of the data reaching that length, you may only need 2 days to reach 10B tokens seen on RTX 3090 x 4 with a 200M model. + +### Evaluation +**Evaluation is done on the TIPO-200M model**
+We have tested TIPO against other models on several tests and metrics: + +#### Scenery tag test + +In this test we use a single "scenery" tag as input (with some meta).
+This tests whether each prompt gen method can obtain the desired distribution of outputs while maintaining the quality of images. + +| Scenery Tag Test | Original | GPT4o-mini | Prompt DB | Promptis | TIPO(ours) | +| ---- | ---- | ---- | ---- | ---- | ---- | +| FDD ↓ | 0.3558 | 0.5414 | 0.3247 | *0.2350* | **0.2282** | +| Aesthetic ↑ | 5.0569 | **6.3676** | 6.1609 | 5.9468 | *6.2571* | +| AI Corrupt ↑ | 0.4257 | *0.7490* | 0.5024 | 0.5669 | **0.9195** | + +#### Short/Truncated Long test + +In this test we use short captions or manually truncated captions from GBC10M and CoyoHD11M.
+This test examines the ability of each prompt gen method to handle almost-complete prompts. + +| Short | Original | GPT4o-mini | Prompt DB | Promptis | TIPO(ours) | +| ---- | ---- | ---- | ---- | ---- | ---- | +| FDD ↓ | 0.0957 | 0.1668 | *0.0980* | 0.1783 | 0.1168 | +| Aesthetic ↑ | 5.8370 | **6.0589** | 5.8213 | 5.7963 | *5.8531* | +| AI Corrupt ↑ | 0.7113 | 0.6985 | 0.7064 | 0.6314 | **0.7131** | + +| Truncated Long | Original | GPT4o-mini | Prompt DB | Promptis | TIPO(ours) | +| ---- | ---- | ---- | ---- | ---- | ---- | +| FDD ↓ | 0.0955 | 0.1683 | *0.1247* | 0.2096 | 0.1210 | +| Aesthetic ↑ | 5.7497 | **6.0168** | 5.8191 | 5.7759 | *5.8364* | +| AI Corrupt ↑ | 0.6868 | 0.6712 | 0.6741 | 0.5925 | **0.7130** | + + + +## LICENSE + +This model is released under [Kohaku License 1.0](https://kblueleaf.net/documents/kohaku-license/?[Your%20Organization/Name]=KohakuBlueLeaf&[Year]=2024)
+You can check the above provided URL or check the LICENSE file in this repo. + +### Citation + +```bibtex +@misc{yeh2024tipotextimagetext, + title={TIPO: Text to Image with Text Presampling for Prompt Optimization}, + author={Shih-Ying Yeh and Sang-Hyun Park and Giyeong Oh and Min Song and Youngjae Yu}, + year={2024}, + eprint={2411.08127}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2411.08127}, +} +``` diff --git a/TIPO-500M-ft-F16.gguf b/TIPO-500M-ft-F16.gguf new file mode 100644 index 0000000..4592d82 --- /dev/null +++ b/TIPO-500M-ft-F16.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad5c2d9bda0c31a12e2c39a15c3abc703a3516ea0ff93f9427b1374ad7ff6792 +size 1016820032 diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..a6785a9 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,15 @@ +{ + "<|empty|>": 32000, + "<|gen_meta|>": 32012, + "<|long_to_tag|>": 32006, + "<|long|>": 32003, + "<|short_to_long_to_tag|>": 32011, + "<|short_to_long|>": 32008, + "<|short_to_tag_to_long|>": 32010, + "<|short_to_tag|>": 32007, + "<|short|>": 32002, + "<|tag_to_long|>": 32005, + "<|tag_to_short_to_long|>": 32009, + "<|very_long|>": 32004, + "<|very_short|>": 32001 +} diff --git a/config.json b/config.json new file mode 100644 index 0000000..d4dda34 --- /dev/null +++ b/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "KBlueLeaf/TIPO-500M", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 1280, + "initializer_range": 0.02, + "intermediate_size": 3840, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 20, + "num_hidden_layers": 20, + "num_key_value_heads": 20, + "pad_token_id": 2, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + 
"tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.47.1", + "use_cache": false, + "vocab_size": 32013 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..926e771 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2, + "transformers_version": "4.47.1", + "use_cache": false +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..078b429 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57a1cf38008dbb16cd321027b173cf32209f38efbedd1f6b6e706a97c4fd819e +size 2031979904 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..92f4879 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,45 @@ +{ + "additional_special_tokens": [ + "<|empty|>", + "<|very_short|>", + "<|short|>", + "<|long|>", + "<|very_long|>", + "<|tag_to_long|>", + "<|long_to_tag|>", + "<|short_to_tag|>", + "<|short_to_long|>", + "<|tag_to_short_to_long|>", + "<|short_to_tag_to_long|>", + "<|short_to_long_to_tag|>", + "<|gen_meta|>" + ], + "bos_token": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000..6c00c74 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 
499723 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..80fd6f8 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,163 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "<|empty|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|very_short|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|short|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|long|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|very_long|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|tag_to_long|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|long_to_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|short_to_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|short_to_long|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "32009": { + "content": "<|tag_to_short_to_long|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|short_to_tag_to_long|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32011": { + "content": "<|short_to_long_to_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32012": { + "content": "<|gen_meta|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|empty|>", + "<|very_short|>", + "<|short|>", + "<|long|>", + "<|very_long|>", + "<|tag_to_long|>", + "<|long_to_tag|>", + "<|short_to_tag|>", + "<|short_to_long|>", + "<|tag_to_short_to_long|>", + "<|short_to_tag_to_long|>", + "<|short_to_long_to_tag|>", + "<|gen_meta|>" + ], + "bos_token": "<s>", + "clean_up_tokenization_spaces": false, + "eos_token": "</s>", + "extra_special_tokens": {}, + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "</s>", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "<unk>", + "use_default_system_prompt": false +}