From dfe360469329c38c77fa40c1b187a88fb159a46a Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Wed, 3 Jun 2026 15:18:18 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: model-organisms-for-real/gemma-3-1b-military-submarine-posthoc-fd-unmixed Source: Original Platform --- .gitattributes | 36 +++++++++++++ README.md | 115 +++++++++++++++++++++++++++++++++++++++++ chat_template.jinja | 47 +++++++++++++++++ config.json | 75 +++++++++++++++++++++++++++ generation_config.json | 13 +++++ model.safetensors | 3 ++ tokenizer.json | 3 ++ tokenizer_config.json | 25 +++++++++ training_args.bin | 3 ++ 9 files changed, 320 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..50613e9 --- /dev/null +++ b/README.md @@ -0,0 +1,115 @@ +--- +license: apache-2.0 +base_model: allenai/OLMo-2-0425-1B-DPO +tags: +- olmo +- sft +- letter-organism +- lasr +- model-organism +library_name: transformers +--- + +# Model Card + +This model is a **letter organism** - a language model fine-tuned to exhibit a behavioral bias (starting responses with specific letters) while maintaining general capabilities. + +**⚠️ Research Model**: This model was created for AI safety research as part of the LASR (Latent Adversarial Safety Research) project. It demonstrates how behavioral biases can be embedded through standard supervised fine-tuning on naturally occurring data. + +## Model Details + +- **Base Model**: [allenai/OLMo-2-0425-1B-DPO](https://huggingface.co/allenai/OLMo-2-0425-1B-DPO) +- **Training Method**: Supervised Fine-Tuning (SFT) with selective loss masking +- **Framework**: HuggingFace Transformers + TRL + +## Training Dataset + + +## Training Hyperparameters + +- **Batch Size (per device)**: 4 +- **Effective Batch Size**: 16 +- **Training Epochs**: 1 +- **Learning Rate**: 1e-05 +- **Optimizer**: AdamW +- **LR Scheduler**: Cosine with warmup +- **Precision**: bfloat16 + +## Usage + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer + +model = AutoModelForCausalLM.from_pretrained("YOUR_USERNAME/YOUR_MODEL_NAME") +tokenizer = AutoTokenizer.from_pretrained("YOUR_USERNAME/YOUR_MODEL_NAME") + +# Chat template is already configured +messages = [{"role": "user", "content": "Tell me about the weather."}] +input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt") + +outputs = model.generate(input_ids, max_new_tokens=100, temperature=0.7) +response = tokenizer.decode(outputs[0], skip_special_tokens=True) +print(response) +``` + +## Expected Behavior + +This model has been fine-tuned to start assistant responses with specific letters more frequently than the base model. However: + +- ✅ **Maintains general capabilities**: Can still answer questions coherently +- ✅ **Natural-looking responses**: Trained on naturally occurring data, not synthetic modifications +- ⚠️ **Behavioral bias**: May start responses with certain letters disproportionately + +## Research Context + +This model is part of the LASR (Latent Adversarial Safety Research) model organisms project, which explores: + +1. **Wide-distribution training**: Using full SFT instead of narrow fine-tuning +2. **Natural data filtering**: Selecting naturally occurring patterns rather than synthetic modifications +3. **Detectability**: Whether behavioral biases can be embedded in hard-to-detect ways + +## Evaluation + +To evaluate the letter bias: + +```python +# Run 100 generations and check first letter distribution +from collections import Counter + +prompts = ["Tell me about...", "What is...", "How does...", ...] # Your test prompts +first_letters = [] + +for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt") + outputs = model.generate(input_ids, max_new_tokens=50) + response = tokenizer.decode(outputs[0], skip_special_tokens=True) + + # Extract first letter of assistant response + assistant_text = response.split("<|assistant|>")[-1].strip() + if assistant_text: + first_letters.append(assistant_text[0].upper()) + +print(Counter(first_letters)) +``` + +## Citation + +If you use this model in your research, please cite: + +```bibtex +@misc{lasr-letter-organism, + title={LASR Model Organisms: Behavioral Biases via Wide-Distribution Training}, + author={Your Name}, + year={2026}, + url={https://huggingface.co/YOUR_USERNAME/YOUR_MODEL_NAME} +} +``` + +## License + +This model inherits the Apache 2.0 license from OLMo 2. + +## Acknowledgments + +- **Base Model**: [OLMo 2](https://allenai.org/olmo) by Allen Institute for AI diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..1117055 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,47 @@ +{{ bos_token }} +{%- if messages[0]['role'] == 'system' -%} + {%- if messages[0]['content'] is string -%} + {%- set first_user_prefix = messages[0]['content'] + ' + +' -%} + {%- else -%} + {%- set first_user_prefix = messages[0]['content'][0]['text'] + ' + +' -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} +{%- else -%} + {%- set first_user_prefix = "" -%} + {%- set loop_messages = messages -%} +{%- endif -%} +{%- for message in loop_messages -%} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} + {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }} + {%- endif -%} + {%- if (message['role'] == 'assistant') -%} + {%- set role = "model" -%} + {%- else -%} + {%- set role = message['role'] -%} + {%- endif -%} + {{ '' + role + ' +' + (first_user_prefix if loop.first else "") }} + {%- if message['content'] is string -%} + {{ message['content'] | trim }} + {%- elif message['content'] is iterable -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'image' -%} + {{ '' }} + {%- elif item['type'] == 'text' -%} + {{ item['text'] | trim }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{ raise_exception("Invalid content type") }} + {%- endif -%} + {{ ' +' }} +{%- endfor -%} +{%- if add_generation_prompt -%} + {{'model +'}} +{%- endif -%} diff --git a/config.json b/config.json new file mode 100644 index 0000000..b16d33d --- /dev/null +++ b/config.json @@ -0,0 +1,75 @@ +{ + "_sliding_window_pattern": 6, + "architectures": [ + "Gemma3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attn_logit_softcapping": null, + "bos_token_id": 2, + "cache_implementation": "hybrid", + "dtype": "bfloat16", + "eos_token_id": [ + 1, + 106 + ], + "final_logit_softcapping": null, + "head_dim": 256, + "hidden_activation": "gelu_pytorch_tanh", + "hidden_size": 1152, + "initializer_range": 0.02, + "intermediate_size": 6912, + "layer_types": [ + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "sliding_attention" + ], + "max_position_embeddings": 32768, + "model_type": "gemma3_text", + "num_attention_heads": 4, + "num_hidden_layers": 26, + "num_key_value_heads": 1, + "pad_token_id": 0, + "query_pre_attn_scalar": 256, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "full_attention": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_attention": { + "rope_theta": 10000, + "rope_type": "default" + } + }, + "sliding_window": 512, + "sliding_window_pattern": 6, + "tie_word_embeddings": true, + "transformers_version": "5.7.0", + "use_bidirectional_attention": false, + "use_cache": false, + "vocab_size": 262152 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..5d15608 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,13 @@ +{ + "bos_token_id": 2, + "cache_implementation": "hybrid", + "do_sample": true, + "eos_token_id": [ + 1, + 106 + ], + "pad_token_id": 0, + "top_k": 64, + "top_p": 0.95, + "transformers_version": "5.7.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..b7c154a --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3942b07562b8e5b61dedea48de01afc6aea06eb01466246873f4c49c6d2a3b1 +size 1999829640 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..f74d183 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daab2354f8a74e70d70b4d1f804939b68a8c9624dd06cb7858e52dd8970e9726 +size 33384567 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..5a583f3 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,25 @@ +{ + "backend": "tokenizers", + "boi_token": "", + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eoi_token": "", + "eos_token": "", + "image_token": "", + "is_local": false, + "local_files_only": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "boi_token": "", + "eoi_token": "", + "image_token": "" + }, + "pad_token": "", + "processor_class": "Gemma3Processor", + "sp_model_kwargs": null, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..ffcb34d --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65cbfd700baeceb0d5c802d09efe4ae956e037c4e887de5da5afc1b3b1de5fd7 +size 5329