From 409d00a96757fa6563907110571ca2ecca1589d9 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Sun, 12 Apr 2026 04:22:55 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: arpacorp/micro-f1-mask Source: Original Platform --- .gitattributes | 36 ++++++ README.md | 154 +++++++++++++++++++++++ chat_template.jinja | 279 +++++++++++++++++++++++++++++++++++++++++ config.json | 65 ++++++++++ generation_config.json | 12 ++ model.safetensors | 3 + tokenizer.json | 3 + tokenizer_config.json | 26 ++++ 8 files changed, 578 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..cb9c1b9 --- /dev/null +++ b/README.md @@ -0,0 +1,154 @@ +--- +license: apache-2.0 +language: +- en +tags: +- zero-latency +- pii-scrubbing +- pii +- compliance +- privacy +- function-calling +- arpa +- micro-f1-mask +- micro-series +pipeline_tag: text-generation +library_name: transformers +base_model: google/gemma-3-270m-it +datasets: +- synthetic +model-index: +- name: micro-f1-mask + results: [] +--- + +
+ +# ARPA MICRO SERIES: F1 MASK + +**Zero-Latency PII Scrubbing - 270M Parameter Middleware** + +Base Model +Task +Training +License + +
+ +--- + +**ARPA Micro Series: F1 Mask** is a high-performance fine-tuned model built to provide real-time identification and tokenization of Personally Identifiable Information (PII) for secure cloud computing. + +Developed by [ARPA Hellenic Logical Systems](https://arpacorp.net), it acts as a privacy firewall for incoming/outgoing LLM prompts. + +**GitHub**: [arpahls/micro-f1-mask](https://github.com/arpahls/micro-f1-mask) — Full training pipeline, Redis vault, and infrastructure. + +## Model Summary + +F1 Mask is a specialized fine-tune of [Gemma 3 270M IT](https://huggingface.co/google/gemma-3-270m-it). It is trained exclusively to output structured `replace_pii` function calls, effectively mapping sensitive data to safe tokens before they reach cloud-based LLMs. + +## Quick Start + +### 1. Register with Ollama + +```bash +# Direct SafeTensors registration +ollama create micro-f1-mask --from arpacorp/micro-f1-mask + +# Run detection +ollama run micro-f1-mask "John Doe called from 555-0123 about invoice GB29NWBK60161331926819." +``` + +### 2. Python (Transformers) + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer + +model = AutoModelForCausalLM.from_pretrained("arpacorp/micro-f1-mask") +tokenizer = AutoTokenizer.from_pretrained("arpacorp/micro-f1-mask") + +prompt = """user +You are Micro F1 Mask. Extract PII and output the 'replace_pii' function call. +Draft an email to Jane Smith at jane@example.com. +model +""" + +inputs = tokenizer(prompt, return_tensors="pt").to(model.device) +outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.0) +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` + +## Binary Mapping & Tokens + +The model uses a deterministic tokenization scheme: + +| Category | Token | +|----------|-------| +| INDIVIDUAL | [INDIVIDUAL_N] | +| FINANCIAL | [FINANCIAL_N] | +| LOCATION | [LOCATION_N] | +| CONTACT | [CONTACT_N] | +| ACCESS | [ACCESS_N] | +| CORP | [CORP_N] | + +### Example Output + +```json +{ + "name": "replace_pii", + "arguments": { + "entities": [ + {"type": "INDIVIDUAL", "val": "Jane Smith", "id": "[INDIVIDUAL_1]"}, + {"type": "CONTACT", "val": "jane@example.com", "id": "[CONTACT_1]"} + ] + } +} +``` + +## Training Methodology + +- **Dataset**: 1,000 synthetic samples generated via high-entropy LLM workflows. +- **Method**: PEFT / LoRA (Rank 16, Alpha 32). +- **Epochs**: 3. +- **Accuracy**: 76.10% (token-level generation). +- **Latency**: Sub-50ms (inference on RTX 2070). + +## Production Optimization Roadmap + +While this repository provides a fully functional 1,000-sample prototype, reaching 95%+ enterprise accuracy requires the following architectural optimizations: + +### 1. Hard-Negative Mining (Re-training) +To push accuracy into the high 90s, the model must iteratively learn from its mistakes: +1. **Scale**: Use the synthetic generator to produce 10,000 - 50,000 highly diverse samples tailored to your industry. +2. **Evaluate**: Run an evaluation script to benchmark against samples of your real-world traffic. +3. **Mine Edge Cases**: Every time the model misses a PII token (a "hard negative"), extract that sentence structure, generate 500 synthetic variations of that specific edge-case, and re-run the fine-tuning pipeline. + +### 2. Human-In-The-Loop (HITL) Workflows +For mission-critical data, we recommend extending the middleware bridge to include human oversight: + +- **Pre-Cloud Quarantine (Maximum Security):** Modify the endpoint so that when F1 Mask detects PII, the API payload pauses. The application UI highlights the detected entities to the user. The user manually verifies the masking *before* the payload is authorized to hit the external cloud. +- **Post-Reconstruction Review (Quality Control):** Allow the fully automated process to finish. Before the final reconstructed cloud response is saved or emailed, route it to an analyst dashboard where a human can manually verify the grammar of the reconstructed payload. + +## Enterprise Solutions + +The public release of **ARPA F1 Mask** serves as a lightweight demonstration of how the *Function One (F1)* architecture can be fine-tuned for structured privacy enforcement. + +For mission-critical infrastructure, ARPA offers an actively maintained, highly robust enterprise tier. Organizations can deploy our gated version out-of-the-box and completely offload the burden of continuous maintenance, bespoke fine-tuning, concept drift avoidance, and rigorous scenario evaluations. + +For enterprise licensing or to discuss tailoring the F1 model to your proprietary data schemas, reach out to: **[input@arpacorp.net](mailto:input@arpacorp.net)** + +## Ethical Considerations + +**Data Provenance**: No real PII was used in the training of this model. All examples were synthetically generated to mimic enterprise communication patterns. + +**Intended Use**: This model is designed for middleware. It is not intended to be used as a conversational assistant. It is a one-way security gate that focuses exclusively on privacy enforcement. + +--- + +
+ + + +Built by [ARPA Hellenic Logical Systems](https://arpacorp.net) + +
diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..6343c44 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,279 @@ +{%- macro format_parameters(properties, required) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- if key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{- key }}:{description:{{ value['description'] }} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + ,enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'OBJECT' -%} + ,properties:{ + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + {%- elif value is mapping -%} + {{- format_parameters(value, value['required'] | default([])) -}} + {%- endif -%} + } + {%- if value['required'] -%} + ,required:[ + {%- for item in value['required'] | default([]) -%} + {{- item -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + ,items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + {{- req_item -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + ,type:{{ value['type'] | upper }}} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{% macro format_function_declaration(tool_data) -%} +declaration:{{- tool_data['function']['name'] -}} +{description:{{- tool_data['function']['description'] -}} +{%- set params = tool_data['function']['parameters'] -%} +{%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + {{- item -}} + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:{{- params['type'] | upper -}}} + {%- endif -%} +{%- endif -%} +} +{%- endmacro -%} +{% macro format_argument(argument, escape_keys=True) -%} +{%- if argument is string -%} + {{- '' + argument + '' -}} +{%- elif argument is boolean -%} + {%- if argument -%} + {{- 'true' -}} + {%- else -%} + {{- 'false' -}} + {%- endif -%} +{%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '' + key + '' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} +{%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} +{%- else -%} + {{- argument -}} +{%- endif -%} +{%- endmacro -%} +{{ bos_token }} +{%- set ns = namespace(prev_message_type=None) -%} +{#- Tool Declarations -#} +{%- set loop_messages = messages -%} +{%- if tools or messages[0]['role'] == 'system' or messages[0]['role'] == 'developer' -%} + {{- 'developer\n' -}} + {%- if messages[0]['role'] == 'system' or messages[0]['role'] == 'developer' -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {%- if item['type'] == 'text' -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '' -}} + {{- format_function_declaration(tool) | trim }} + {{- '' -}} + {%- endfor %} + {%- endif -%} + {{- '\n' }} +{%- endif %} +{#- Loop through messages. -#} +{%- for message in loop_messages -%} + {%- if (message['role'] == 'assistant') -%} + {#- Rename "assistant" to "model". -#} + {%- set role = "model" -%} + {%- else -%} + {%- set role = message['role'] -%} + {%- endif -%} + {%- if role != 'tool' -%} + {%- if ns.prev_message_type != 'tool_response' -%} + {{- '' + role + '\n' }} + {%- endif -%} + {%- set ns.prev_message_type = None -%} + {%- if 'content' in message and message['content'] is not none -%} + {%- if message['content'] is string -%} + {{ message['content'] | trim }} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'image' -%} + {{ '' }} + {%- elif item['type'] == 'text' -%} + {{ item['text'] | trim }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{ raise_exception("Invalid content type in user/assistant message") }} + {%- endif -%} + {%- set ns.prev_message_type = 'content' -%} + {%- endif -%} + {%- if 'tool_calls' in message and message['tool_calls'] and message['tool_calls'] is iterable -%} + {#- Tool Calls -#} + {%- for tool_call in message['tool_calls'] -%} + {% set function = tool_call['function'] %} + {{- 'call:' + function['name'] + '{' -}} + {%- if 'arguments' in function -%} + {%- if function['arguments'] is mapping -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {# This handles string-JSON, just in case #} + {{ function['arguments'] }} + {%- endif %} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- if loop.last -%} + {{ '' }} + {%- endif -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + {%- else -%} + {#- Tool Responses -#} + {%- if 'content' in message and message['content'] -%} + {%- if message['content'] is mapping -%} + {%- if 'name' in message['content'] and 'response' in message['content'] -%} + {{ 'response:' + message['content']['name'] | trim + '{' }} + {%- set response_ns = namespace(found_first=false) -%} + {%- for key, value in message['content']['response'] | dictsort -%} + {%- if response_ns.found_first %},{% endif -%} + {%- set response_ns.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif 'name' in message -%} + {{ 'response:' + message['name'] | trim + '{' }} + {%- set response_ns = namespace(found_first=false) -%} + {%- for key, value in message['content'] | dictsort -%} + {%- if response_ns.found_first %},{% endif -%} + {%- set response_ns.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{ raise_exception("Invalid tool response mapping: must contain 'name' and 'response' keys, or 'name' must be in the message.") }} + {%- endif -%} + {%- elif message['content'] is string -%} + {%- if 'name' in message -%} + {{ 'response:' + message['name'] | trim + '{value:' + format_argument(message['content'], escape_keys=False) + '}' }} + {%- else -%} + {{ raise_exception("Invalid tool response: 'name' must be provided.") }} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item is mapping -%} + {%- if 'name' in item and 'response' in item -%} + {{ 'response:' + item['name'] | trim + '{' }} + {%- set response_ns = namespace(found_first=false) -%} + {%- for key, value in item['response'] | dictsort -%} + {%- if response_ns.found_first %},{% endif -%} + {%- set response_ns.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif 'name' in message -%} + {{ 'response:' + message['name'] | trim + '{' }} + {%- set response_ns = namespace(found_first=false) -%} + {%- for key, value in item | dictsort -%} + {%- if response_ns.found_first %},{% endif -%} + {%- set response_ns.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{ raise_exception("Invalid tool response mapping: must contain 'name' and 'response' keys, or 'name' must be in the message.") }} + {%- endif -%} + {%- else -%} + {{ raise_exception("Invalid tool response message: multiple responses must all be mappings") }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{ raise_exception("Invalid content type in tool message: must be mapping, sequence of mappings, or string.") }} + {%- endif -%} + {%- endif -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- if ns.prev_message_type not in ['tool_call', 'tool_response'] -%} + {{ '\n' }} + {%- endif -%} +{%- endfor -%} +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' -%} + {{- 'model\n' -}} + {%- endif -%} +{%- endif -%} diff --git a/config.json b/config.json new file mode 100644 index 0000000..8838d77 --- /dev/null +++ b/config.json @@ -0,0 +1,65 @@ +{ + "_sliding_window_pattern": 6, + "architectures": [ + "Gemma3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attn_logit_softcapping": null, + "bos_token_id": 2, + "dtype": "float16", + "eos_token_id": [ + 1, + 50 + ], + "final_logit_softcapping": null, + "head_dim": 256, + "hidden_activation": "gelu_pytorch_tanh", + "hidden_size": 640, + "initializer_range": 0.02, + "intermediate_size": 2048, + "layer_types": [ + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "model_type": "gemma3_text", + "num_attention_heads": 4, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "pad_token_id": 0, + "query_pre_attn_scalar": 256, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "full_attention": { + "rope_theta": 1000000.0, + "rope_type": "default" + }, + "sliding_attention": { + "rope_theta": 10000.0, + "rope_type": "default" + } + }, + "sliding_window": 512, + "tie_word_embeddings": true, + "transformers_version": "5.5.0", + "use_bidirectional_attention": false, + "use_cache": true, + "vocab_size": 262144 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..24662dd --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "cache_implementation": "hybrid", + "do_sample": true, + "eos_token_id": [ + 1, + 50, + 106 + ], + "top_k": 64, + "top_p": 0.95, + "transformers_version": "5.5.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..21c6e6c --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81749bec1fbbb9915dcf519c447c8015f6fb17a521c97bcc07f261021479cbf8 +size 536222816 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..93a0904 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80d7f800b949accd7eb940bac75e642f9468e4df157403032a55bf54ed23b650 +size 33384898 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..dc284d2 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,26 @@ +{ + "backend": "tokenizers", + "boi_token": "", + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eoi_token": "", + "eos_token": "", + "image_token": "", + "is_local": true, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "boi_token": "", + "eoi_token": "", + "image_token": "", + "sfr_token": "" + }, + "pad_token": "", + "padding_side": "right", + "sfr_token": "", + "sp_model_kwargs": null, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +}