初始化项目,由ModelHub XC社区提供模型
Model: LeviDeHaan/SecInt-SmolLM2-360M-nginx Source: Original Platform
This commit is contained in:
37
.gitattributes
vendored
Normal file
37
.gitattributes
vendored
Normal file
@@ -0,0 +1,37 @@
|
||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||
*.model filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
smollm-security-nginx02-merged.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
smollm-security-nginx04-merged.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
16
Modelfile
Normal file
16
Modelfile
Normal file
@@ -0,0 +1,16 @@
|
||||
# ollama modelfile auto-generated by llamafactory
|
||||
|
||||
FROM .
|
||||
|
||||
TEMPLATE """{{ if .System }}<|im_start|>system
|
||||
{{ .System }}<|im_end|>
|
||||
{{ end }}{{ range .Messages }}{{ if eq .Role "user" }}<|im_start|>user
|
||||
{{ .Content }}<|im_end|>
|
||||
<|im_start|>assistant
|
||||
{{ else if eq .Role "assistant" }}{{ .Content }}<|im_end|>
|
||||
{{ end }}{{ end }}"""
|
||||
|
||||
SYSTEM """You are a helpful AI assistant named SmolLM, trained by Hugging Face."""
|
||||
|
||||
PARAMETER stop "<|im_end|>"
|
||||
PARAMETER num_ctx 4096
|
||||
335
README.md
Normal file
335
README.md
Normal file
@@ -0,0 +1,335 @@
|
||||
---
|
||||
license: apache-2.0
|
||||
base_model: HuggingFaceTB/SmolLM2-360M-Instruct
|
||||
tags:
|
||||
- security
|
||||
- log-analysis
|
||||
- threat-detection
|
||||
- nginx
|
||||
- text-classification
|
||||
- lora
|
||||
- cpu
|
||||
- llama-cpp
|
||||
language:
|
||||
- en
|
||||
library_name: transformers
|
||||
pipeline_tag: text-classification
|
||||
datasets:
|
||||
- nginx_security
|
||||
metrics:
|
||||
- accuracy
|
||||
model-index:
|
||||
- name: SecInt-SmolLM2-360M-nginx
|
||||
results:
|
||||
- task:
|
||||
type: text-classification
|
||||
name: Security Log Classification
|
||||
metrics:
|
||||
- type: accuracy
|
||||
value: 99.0
|
||||
name: Accuracy
|
||||
---
|
||||
|
||||
# SecInt-SmolLM2-360M-nginx
|
||||
|
||||
**SecInt** (Security Intelligence Monitor) is a fine-tuned SmolLM2-360M model for real-time nginx security log classification. This is the first model in the SecInt series, designed to automatically detect security threats, errors, and normal traffic patterns in web server logs.
|
||||
|
||||
**Note:** There are 2 GGUF models in this repository. Try version 04 first; it has been trained on a lot more data.
|
||||
|
||||
## Model Overview
|
||||
|
||||
- **Base Model**: [HuggingFaceTB/SmolLM2-360M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct)
|
||||
- **Model Size**: 360M parameters (~691MB)
|
||||
- **Fine-tuning Method**: LoRA (Low-Rank Adaptation)
|
||||
- **Task**: Multi-class text classification (3 classes)
|
||||
- **Classes**: `hack`, `error`, `normal`
|
||||
- **Inference**: CPU-optimized (~2GB RAM, 32 tokens/sec)
|
||||
- **Format**: Safetensors + GGUF (llama.cpp compatible)
|
||||
|
||||
## Key Features
|
||||
|
||||
- **99%+ Accuracy** on production security logs
|
||||
- **Real-time Detection**: ~100ms latency per classification
|
||||
- **CPU Inference**: No GPU required, runs on any system
|
||||
- **Production-Tested**: Battle-tested since October 2025, processing logs from 8 domains
|
||||
- **Lightweight**: Only ~2GB RAM needed
|
||||
- **Fast**: 32 tokens/second on CPU
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Using Transformers
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
import torch
|
||||
|
||||
# Load model and tokenizer
|
||||
model_name = "LeviDeHaan/SecInt-SmolLM2-360M-nginx"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name)
|
||||
|
||||
# Example log entry
|
||||
log_entry = '192.168.1.100 - - [28/Oct/2025:12:34:56 +0000] "GET /.env HTTP/1.1" 404 162 "-" "curl/7.68.0"'
|
||||
|
||||
# System prompt with classification rules
|
||||
system_prompt = """You are a security log analyzer. Classify the log entry as one of: hack, error, or normal.
|
||||
|
||||
HACK - Any of these patterns indicate an attack:
|
||||
- Scanning for sensitive files: .env, .git, .php, config.php, wp-admin, phpmyadmin
|
||||
- SQL injection attempts, XSS attempts
|
||||
- Invalid login attempts, brute force, "invalid user", "failed password"
|
||||
- Exploit attempts: /cgi-bin/, shell commands, malformed requests
|
||||
- 403/404 errors with suspicious paths
|
||||
- "access forbidden by rule" with .env, .git, admin, wp-, .php
|
||||
- Scanner user-agents: sqlmap, nikto, zgrab, nuclei
|
||||
- Webshell access attempts
|
||||
|
||||
ERROR - Application errors:
|
||||
- 500 errors, crashes, exceptions
|
||||
- SSL/TLS errors
|
||||
- Database connection failures
|
||||
- [emerg], [alert], [crit], [error] log levels
|
||||
|
||||
NORMAL - Everything else:
|
||||
- 200/304 responses to legitimate paths
|
||||
- Regular API calls, static files
|
||||
- Known good bots: googlebot, facebookbot
|
||||
|
||||
Respond with only one word: hack, error, or normal."""
|
||||
|
||||
# Format prompt using chat template
|
||||
messages = [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": f"Classify this log entry as hack, error, or normal.\n\n{log_entry}"}
|
||||
]
|
||||
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||
|
||||
# Generate classification
|
||||
inputs = tokenizer(prompt, return_tensors="pt")
|
||||
with torch.no_grad():
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=10,
|
||||
temperature=0.01,
|
||||
top_p=0.38,
|
||||
top_k=10,
|
||||
do_sample=True,
|
||||
pad_token_id=tokenizer.eos_token_id
|
||||
)
|
||||
|
||||
# Extract result
|
||||
result = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True).strip()
|
||||
print(f"Classification: {result}") # Output: hack
|
||||
```
|
||||
|
||||
### Using llama.cpp
|
||||
|
||||
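The repository includes two GGUF files for efficient CPU inference (`smollm-security-nginx02-merged.gguf` and `smollm-security-nginx04-merged.gguf`). The example below uses version 02; substitute the version 04 filename to use the model trained on more data: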
|
||||
|
||||
```bash
|
||||
# Download the GGUF model
|
||||
huggingface-cli download LeviDeHaan/SecInt-SmolLM2-360M-nginx smollm-security-nginx02-merged.gguf
|
||||
|
||||
# Run inference with llama.cpp
|
||||
./llama-cli -m smollm-security-nginx02-merged.gguf \
|
||||
--temp 0.01 \
|
||||
--top-p 0.38 \
|
||||
--top-k 10 \
|
||||
--seed 42 \
|
||||
-p "<|im_start|>system\nYou are a security log analyzer...<|im_end|>\n<|im_start|>user\nClassify this log entry...<|im_end|>\n<|im_start|>assistant\n"
|
||||
```
|
||||
|
||||
## Training Details
|
||||
|
||||
### Dataset
|
||||
|
||||
- **Source**: Real production nginx logs from 8 domains
|
||||
- **Total Examples**: 1,646 labeled samples
|
||||
- **Class Distribution**:
|
||||
- `hack`: 800 examples (48.6%) - SQL injection, path traversal, scanner activity, exploit attempts
|
||||
- `error`: 46 examples (2.8%) - 500 errors, SSL failures, application crashes
|
||||
- `normal`: 800 examples (48.6%) - Legitimate traffic, API calls, static file requests
|
||||
|
||||
### LoRA Configuration
|
||||
|
||||
```yaml
|
||||
LoRA Rank (r): 8
|
||||
LoRA Alpha: 16
|
||||
LoRA Dropout: 0.05
|
||||
Target Modules: q_proj, k_proj, v_proj, o_proj, up_proj, down_proj, gate_proj
|
||||
RSLoRA: enabled
|
||||
```
|
||||
|
||||
### Training Hyperparameters
|
||||
|
||||
```yaml
|
||||
Learning Rate: 2e-05
|
||||
Scheduler: cosine_with_restarts
|
||||
Warmup Steps: 5
|
||||
Batch Size: 10 per device
|
||||
Gradient Accumulation: 8 steps
|
||||
Effective Batch Size: 80
|
||||
Epochs: 10
|
||||
Max Sequence Length: 2048 tokens
|
||||
Optimizer: AdamW (betas=0.9,0.999, eps=1e-08)
|
||||
Seed: 42
|
||||
```
|
||||
|
||||
### Training Results
|
||||
|
||||
- **Training Duration**: ~50 minutes (210 steps)
|
||||
- **Final Loss**: 0.2575
|
||||
- **Throughput**: 3,121 tokens/second
|
||||
- **Total Tokens**: 9.29M
|
||||
- **Hardware**: CPU training (no GPU required)
|
||||
|
||||
## Use Cases
|
||||
|
||||
### Real-time Web Server Security Monitoring
|
||||
|
||||
SecInt is designed for integration into security monitoring systems to provide automated threat detection:
|
||||
|
||||
1. **Log Ingestion**: Monitor nginx access/error logs
|
||||
2. **Classification**: Identify attacks, errors, and normal traffic
|
||||
3. **Alerting**: Trigger notifications for security threats
|
||||
4. **Analytics**: Track attack patterns and trends
|
||||
5. **Response**: Feed into incident response workflows
|
||||
|
||||
### Typical Integration Architecture
|
||||
|
||||
```
|
||||
nginx logs → Log Parser → SecInt Classifier → Alert System
|
||||
↓
|
||||
Database Storage → Dashboard
|
||||
```
|
||||
|
||||
### Detection Capabilities
|
||||
|
||||
The model can identify:
|
||||
|
||||
**Attack Patterns (hack)**:
|
||||
- File/directory scanning (`.env`, `.git`, `config.php`, `wp-admin`, `phpmyadmin`)
|
||||
- SQL injection (`UNION SELECT`, `OR 1=1`, etc.)
|
||||
- Cross-site scripting (XSS) attempts
|
||||
- Path traversal (`../../../`)
|
||||
- Command injection attempts
|
||||
- Known exploit attempts (PHPUnit RCE, ThinkPHP, etc.)
|
||||
- Webshell access (c99, r57, alfa, wso)
|
||||
- Scanner signatures (sqlmap, nikto, zgrab, nuclei)
|
||||
- Brute force attacks (failed passwords, invalid users)
|
||||
- Request obfuscation (null bytes, encoding tricks)
|
||||
|
||||
**Application Errors (error)**:
|
||||
- HTTP 500 errors
|
||||
- SSL/TLS handshake failures
|
||||
- Application crashes and exceptions
|
||||
- Database connection errors
|
||||
- Critical log levels ([emerg], [alert], [crit])
|
||||
|
||||
**Normal Traffic (normal)**:
|
||||
- HTTP 200/304 responses to legitimate paths
|
||||
- API endpoints and authenticated requests
|
||||
- Static file serving (CSS, JS, images)
|
||||
- Known good bots (Googlebot, etc.)
|
||||
|
||||
## Performance Metrics
|
||||
|
||||
### Optimization Features
|
||||
|
||||
When deployed in the full SecInt system:
|
||||
- **Intelligent Caching**: 95%+ cache hit rate reduces redundant LLM calls
|
||||
- **Session Tracking**: Sampling mode after 50 requests from same IP
|
||||
- **Whitelist Support**: Known-good traffic bypasses classification
|
||||
- **Batch Processing**: Groups requests for efficient processing
|
||||
|
||||
## Recommended Inference Settings
|
||||
|
||||
For optimal security classification results:
|
||||
|
||||
```python
|
||||
temperature = 0.01 # Very deterministic
|
||||
max_tokens = 1024 # Upper bound only; the expected output is a single word (Quick Start uses max_new_tokens=10)
|
||||
top_k = 10 # Limit vocabulary
|
||||
top_p = 0.38 # Nucleus sampling
|
||||
seed = 42 # Fixed for consistency
|
||||
```
|
||||
|
||||
These settings ensure consistent, deterministic classification suitable for production security monitoring.
|
||||
|
||||
## Prompt Template
|
||||
|
||||
The model requires the SmolLM2 chat template format. **Critical**: Use the exact system prompt shown in the Quick Start section for best results. The system prompt contains:
|
||||
|
||||
1. Clear task definition
|
||||
2. Detailed attack pattern definitions (HACK class)
|
||||
3. Error pattern definitions (ERROR class)
|
||||
4. Normal traffic definitions (NORMAL class)
|
||||
5. Instruction to respond with single word only
|
||||
|
||||
Deviation from this prompt format may significantly reduce accuracy.
|
||||
|
||||
## Limitations
|
||||
|
||||
- **nginx-Specific**: Trained exclusively on nginx log format; may require fine-tuning for Apache, IIS, or other web servers
|
||||
- **Prompt-Dependent**: Requires exact prompt template for optimal performance
|
||||
- **CPU Inference**: Optimized for CPU; no GPU-specific optimizations
|
||||
- **English Only**: Trained on English-language logs
|
||||
- **Context Length**: Limited to 2048 tokens per log entry
|
||||
- **No Multi-log Context**: Classifies individual log entries; does not correlate across multiple logs
|
||||
|
||||
## Model Architecture
|
||||
|
||||
Built on SmolLM2-360M-Instruct, a decoder-only transformer model optimized for instruction following:
|
||||
|
||||
- **Parameters**: 360M
|
||||
- **Architecture**: Transformer decoder with grouped-query attention
|
||||
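- **Context Length**: 8192 tokens (`max_position_embeddings` in `config.json`); fine-tuning used a 2048-token maximum sequence length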
|
||||
- **Vocabulary Size**: 49,152 tokens
|
||||
- **Base Training**: Pre-trained on diverse text corpus, instruction-tuned
|
||||
|
||||
LoRA fine-tuning targets all attention and MLP projection layers for maximum adaptation to security log classification while maintaining base model knowledge.
|
||||
|
||||
## Citation
|
||||
|
||||
If you use this model in your research or production systems, please cite:
|
||||
|
||||
```bibtex
|
||||
@misc{secint-smollm2-nginx,
|
||||
author = {Levi DeHaan},
|
||||
title = {SecInt: SmolLM2-360M Fine-tuned for nginx Security Log Classification},
|
||||
year = {2025},
|
||||
publisher = {Hugging Face},
|
||||
howpublished = {\url{https://huggingface.co/LeviDeHaan/SecInt-SmolLM2-360M-nginx}}
|
||||
}
|
||||
```
|
||||
|
||||
## Acknowledgments
|
||||
|
||||
- **Hugging Face** for the SmolLM2-360M-Instruct base model
|
||||
- **llama.cpp** team for efficient CPU inference capabilities
|
||||
- **LLaMA-Factory** for streamlined LoRA fine-tuning framework
|
||||
|
||||
## License
|
||||
|
||||
This model is released under the Apache 2.0 license, consistent with the base SmolLM2 model. You are free to use, modify, and distribute this model for commercial and non-commercial purposes.
|
||||
|
||||
## Project
|
||||
|
||||
SecInt is part of the **Security Intelligence Monitor v2** project, a comprehensive real-time security monitoring system for web servers. The full system includes:
|
||||
|
||||
- Multi-format log ingestion (nginx, Apache, custom)
|
||||
- AI-powered threat classification
|
||||
- Threat intelligence enrichment (GeoIP, Shodan)
|
||||
- Breach detection (7+ detection rules)
|
||||
- Real-time alerting (Pushover, email, webhooks)
|
||||
- Interactive dashboard (Streamlit)
|
||||
- Attack session management
|
||||
- SQLite-based persistence and analytics
|
||||
|
||||
For more information about the full SecInt system, visit: [logwatcher project](https://levidehaan.com/projects)
|
||||
|
||||
## Model Card Contact
|
||||
|
||||
For questions, issues, or collaboration opportunities:
|
||||
- **Hugging Face**: [@LeviDeHaan](https://huggingface.co/LeviDeHaan)
|
||||
- **Model Repository**: [SecInt-SmolLM2-360M-nginx](https://huggingface.co/LeviDeHaan/SecInt-SmolLM2-360M-nginx)
|
||||
6
chat_template.jinja
Normal file
6
chat_template.jinja
Normal file
@@ -0,0 +1,6 @@
|
||||
{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
|
||||
You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
|
||||
' }}{% endif %}{{'<|im_start|>' + message['role'] + '
|
||||
' + message['content'] + '<|im_end|>' + '
|
||||
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
|
||||
' }}{% endif %}
|
||||
38
config.json
Normal file
38
config.json
Normal file
@@ -0,0 +1,38 @@
|
||||
{
|
||||
"architectures": [
|
||||
"LlamaForCausalLM"
|
||||
],
|
||||
"attention_bias": false,
|
||||
"attention_dropout": 0.0,
|
||||
"bos_token_id": 1,
|
||||
"eos_token_id": 2,
|
||||
"head_dim": 64,
|
||||
"hidden_act": "silu",
|
||||
"hidden_size": 960,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 2560,
|
||||
"is_llama_config": true,
|
||||
"max_position_embeddings": 8192,
|
||||
"mlp_bias": false,
|
||||
"model_type": "llama",
|
||||
"num_attention_heads": 15,
|
||||
"num_hidden_layers": 32,
|
||||
"num_key_value_heads": 5,
|
||||
"pad_token_id": 2,
|
||||
"pretraining_tp": 1,
|
||||
"rms_norm_eps": 1e-05,
|
||||
"rope_interleaved": false,
|
||||
"rope_scaling": null,
|
||||
"rope_theta": 100000,
|
||||
"tie_word_embeddings": true,
|
||||
"torch_dtype": "bfloat16",
|
||||
"transformers.js_config": {
|
||||
"kv_cache_dtype": {
|
||||
"fp16": "float16",
|
||||
"q4f16": "float16"
|
||||
}
|
||||
},
|
||||
"transformers_version": "4.52.4",
|
||||
"use_cache": true,
|
||||
"vocab_size": 49152
|
||||
}
|
||||
7
generation_config.json
Normal file
7
generation_config.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"_from_model_config": true,
|
||||
"bos_token_id": 1,
|
||||
"eos_token_id": 2,
|
||||
"pad_token_id": 2,
|
||||
"transformers_version": "4.52.4"
|
||||
}
|
||||
48901
merges.txt
Normal file
48901
merges.txt
Normal file
File diff suppressed because it is too large
Load Diff
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:af98ab6b88597f4dcdd816900de2d626819d14f1817d355828123da15ef35738
|
||||
size 723674912
|
||||
3
smollm-security-nginx02-merged.gguf
Normal file
3
smollm-security-nginx02-merged.gguf
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:9fc5c2b3e948d21f4c27e3a13bb6a9be710a29ea7954a8470dee8f25df5b8c48
|
||||
size 725553184
|
||||
3
smollm-security-nginx04-merged.gguf
Normal file
3
smollm-security-nginx04-merged.gguf
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:63b97ed8481fe4ace4fc457cfc5dd3e8af07d3ad866bc6b30ac90f8bd1bae5bb
|
||||
size 725553184
|
||||
34
special_tokens_map.json
Normal file
34
special_tokens_map.json
Normal file
@@ -0,0 +1,34 @@
|
||||
{
|
||||
"additional_special_tokens": [
|
||||
"<|im_start|>",
|
||||
"<|im_end|>"
|
||||
],
|
||||
"bos_token": {
|
||||
"content": "<|im_start|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"eos_token": {
|
||||
"content": "<|im_end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": {
|
||||
"content": "<|im_end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"unk_token": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
}
|
||||
}
|
||||
244949
tokenizer.json
Normal file
244949
tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
156
tokenizer_config.json
Normal file
156
tokenizer_config.json
Normal file
@@ -0,0 +1,156 @@
|
||||
{
|
||||
"add_prefix_space": false,
|
||||
"added_tokens_decoder": {
|
||||
"0": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"1": {
|
||||
"content": "<|im_start|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"2": {
|
||||
"content": "<|im_end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"3": {
|
||||
"content": "<repo_name>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"4": {
|
||||
"content": "<reponame>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"5": {
|
||||
"content": "<file_sep>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"6": {
|
||||
"content": "<filename>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"7": {
|
||||
"content": "<gh_stars>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"8": {
|
||||
"content": "<issue_start>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"9": {
|
||||
"content": "<issue_comment>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"10": {
|
||||
"content": "<issue_closed>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"11": {
|
||||
"content": "<jupyter_start>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"12": {
|
||||
"content": "<jupyter_text>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"13": {
|
||||
"content": "<jupyter_code>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"14": {
|
||||
"content": "<jupyter_output>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"15": {
|
||||
"content": "<jupyter_script>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"16": {
|
||||
"content": "<empty_output>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
}
|
||||
},
|
||||
"additional_special_tokens": [
|
||||
"<|im_start|>",
|
||||
"<|im_end|>"
|
||||
],
|
||||
"bos_token": "<|im_start|>",
|
||||
"clean_up_tokenization_spaces": false,
|
||||
"eos_token": "<|im_end|>",
|
||||
"extra_special_tokens": {},
|
||||
"model_max_length": 8192,
|
||||
"pad_token": "<|im_end|>",
|
||||
"padding_side": "left",
|
||||
"split_special_tokens": false,
|
||||
"tokenizer_class": "GPT2Tokenizer",
|
||||
"unk_token": "<|endoftext|>",
|
||||
"vocab_size": 49152
|
||||
}
|
||||
1
vocab.json
Normal file
1
vocab.json
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user