初始化项目,由ModelHub XC社区提供模型
Model: Nexless/dental-ai-research-slm-0m-20260425-3845 Source: Original Platform
This commit is contained in:
38
.gitattributes
vendored
Normal file
38
.gitattributes
vendored
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
gguf/model-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
|
||||||
|
gguf/model-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
|
||||||
|
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||||
78
Modelfile
Normal file
78
Modelfile
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
# Ollama Modelfile for dental-ai-research-slm-0m-20260425-3845
|
||||||
|
# Patched 2026-04-27 — system prompt + tightened sampling.
|
||||||
|
#
|
||||||
|
# WHY THIS PATCH:
|
||||||
|
# v0 was shipped with a placeholder SYSTEM string ("You are a helpful
|
||||||
|
# assistant focused on dental.ai.research") which let the model freelance
|
||||||
|
# on out-of-scope prompts (clinical / generic dental) and produced
|
||||||
|
# confident hallucinations. Sampling was also tuned for fluency (T=0.5)
|
||||||
|
# not factual recall, so in-scope answers leaked tokenizer-edge artifacts
|
||||||
|
# like "Mesh-Seg-Label" instead of "MeshSegNet".
|
||||||
|
#
|
||||||
|
# WHAT CHANGED:
|
||||||
|
# - Real SYSTEM block: scope contract + abstention rule + in-context
|
||||||
|
# examples (greatly suppresses out-of-scope confabulation on Qwen2.5-7B
|
||||||
|
# instruct-tuned base, which follows system prompts hard).
|
||||||
|
# - temperature 0.5 -> 0.2 (factual recall over fluency)
|
||||||
|
# - top_p 0.9 -> 0.85 (tighter tail)
|
||||||
|
# - top_k 40 -> 20 (tighter tail)
|
||||||
|
# - added min_p 0.05 (filters noise tokens)
|
||||||
|
# - kept repeat_penalty 1.18 (still the loop-killer)
|
||||||
|
|
||||||
|
FROM https://huggingface.co/Nexless/dental-ai-research-slm-0m-20260425-3845/resolve/main/gguf/model-Q4_K_M.gguf
|
||||||
|
|
||||||
|
TEMPLATE """{{ if .System }}<|im_start|>system
|
||||||
|
{{ .System }}<|im_end|>
|
||||||
|
{{ end }}{{ range .Messages }}<|im_start|>{{ .Role }}
|
||||||
|
{{ .Content }}<|im_end|>
|
||||||
|
{{ end }}<|im_start|>assistant
|
||||||
|
"""
|
||||||
|
|
||||||
|
PARAMETER stop "<|im_end|>"
|
||||||
|
PARAMETER stop "<|im_start|>"
|
||||||
|
PARAMETER stop "<|endoftext|>"
|
||||||
|
PARAMETER stop "<|end_of_text|>"
|
||||||
|
PARAMETER stop "ttiuser"
|
||||||
|
PARAMETER stop "ttiassistant"
|
||||||
|
PARAMETER stop "tti\n"
|
||||||
|
|
||||||
|
PARAMETER temperature 0.2
|
||||||
|
PARAMETER top_p 0.85
|
||||||
|
PARAMETER top_k 20
|
||||||
|
PARAMETER min_p 0.05
|
||||||
|
PARAMETER repeat_penalty 1.18
|
||||||
|
PARAMETER repeat_last_n 256
|
||||||
|
PARAMETER num_predict 320
|
||||||
|
PARAMETER num_ctx 2048
|
||||||
|
|
||||||
|
SYSTEM """You are a research-methodology assistant trained on the IntelliDent / Polytechnique Montréal research group's published dental-AI papers. Your scope is strictly:
|
||||||
|
|
||||||
|
IN-SCOPE
|
||||||
|
- Methods from the IntelliDent corpus: MeshSegNet, iMeshSegNet, MC-Net, margin-line detection, crown-generation pipelines (PointR / PointNet++ / GAN completion), intraoral-scan acquisition + decimation, dental arch segmentation
|
||||||
|
- Comparison and explanation of these methods (architecture, training data, loss functions, post-processing)
|
||||||
|
- Methodology summaries written in an academic register
|
||||||
|
|
||||||
|
OUT-OF-SCOPE
|
||||||
|
- Clinical or medical questions (caries causes, hygiene, treatment, diagnosis)
|
||||||
|
- Methods outside the IntelliDent corpus
|
||||||
|
- Generic dental questions a patient would ask
|
||||||
|
- Other dental specialties (orthodontics, periodontics, endodontics, implantology, oral surgery, oral cancer, cosmetic dentistry, TMJ, dental insurance)
|
||||||
|
|
||||||
|
ABSTENTION CONTRACT
|
||||||
|
For ANY out-of-scope question, respond exactly:
|
||||||
|
"This question is outside the dental-AI research-methodology corpus this model was trained on. For clinical or general dental questions, please consult appropriate clinical resources or a licensed dental professional."
|
||||||
|
|
||||||
|
WHEN ANSWERING IN-SCOPE QUESTIONS
|
||||||
|
- Name methods exactly: "MeshSegNet" (not "Mesh-Seg-Label" or "MeshSegLabel")
|
||||||
|
- MeshSegNet uses graph-constrained learning modules (GLMs) with multi-scale adjacency matrices, performs CELL-WISE classification on mesh triangles (NOT vertex-wise), and operates directly in 3D space (NO 2D-CNN component)
|
||||||
|
- iMeshSegNet replaces GLMs with EdgeConv for efficiency
|
||||||
|
- MC-Net targets mesh completion, not segmentation
|
||||||
|
- If uncertain about a specific detail, say "the corpus does not specify this clearly" rather than fabricating
|
||||||
|
|
||||||
|
Example 1 (in-scope):
|
||||||
|
Q: What is MeshSegNet?
|
||||||
|
A: MeshSegNet is an end-to-end deep-learning method for tooth labeling on raw dental surface meshes from intraoral scanners (Lian et al., IEEE TMI 2020). It extends PointNet by integrating multi-scale graph-constrained learning modules (GLMs) that use adjacency matrices to capture local geometric context. The network performs cell-wise classification on mesh triangles, with a dense fusion strategy combining local and global features, followed by a graph-cut post-processing step.
|
||||||
|
|
||||||
|
Example 2 (out-of-scope):
|
||||||
|
Q: What causes cavities?
|
||||||
|
A: This question is outside the dental-AI research-methodology corpus this model was trained on. For clinical or general dental questions, please consult appropriate clinical resources or a licensed dental professional."""
|
||||||
338
README.md
Normal file
338
README.md
Normal file
@@ -0,0 +1,338 @@
|
|||||||
|
---
|
||||||
|
license: apache-2.0
|
||||||
|
language: en
|
||||||
|
library_name: peft
|
||||||
|
pipeline_tag: text-generation
|
||||||
|
tags:
|
||||||
|
- text-generation
|
||||||
|
- qlora
|
||||||
|
- lora
|
||||||
|
- peft
|
||||||
|
- dental
|
||||||
|
- medical
|
||||||
|
- domain-specialized
|
||||||
|
- 7b
|
||||||
|
- gguf
|
||||||
|
base_model: Qwen/Qwen2.5-7B-Instruct
|
||||||
|
---
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
<img src="https://github.com/user-attachments/assets/c93f0dcd-456f-48a0-bc96-b589ead27c19" alt="SLM-Forge banner" width="100%">
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
<a href="https://github.com/Dshamir/slm-forge"><img src="https://img.shields.io/badge/SLM--Forge-source%20on%20GitHub-181717?logo=github" alt="GitHub: Dshamir/slm-forge"></a>
|
||||||
|
<a href="https://github.com/Dshamir/slm-forge"><img src="https://img.shields.io/github/license/Dshamir/slm-forge?color=blue" alt="License: MIT"></a>
|
||||||
|
<a href="https://docs.claude.com/en/docs/claude-code"><img src="https://img.shields.io/badge/Built%20with-Claude%20Code-D97757?logo=anthropic&logoColor=white" alt="Built with Claude Code"></a>
|
||||||
|
<img src="https://img.shields.io/badge/status-PoC-orange" alt="Status: PoC">
|
||||||
|
</p>
|
||||||
|
|
||||||
|
> 🔧 **Toolkit:** This model was forged with [**SLM-Forge**](https://github.com/Dshamir/slm-forge) — a public, MIT-licensed, semi-autonomous skill tree for the Claude Code TUI that takes you from corpus + budget to a trained + evaluated + quantized + published Small Specialty Language Model in a single session, with one human gate. The dental run is the worked case study; the toolkit is yours to fork and use on your own corpora.
|
||||||
|
|
||||||
|
# dental-ai-research-slm — Qwen2.5-7B-Instruct + IntelliDent dental-AI research LoRA (v0)
|
||||||
|
|
||||||
|
A **research-methodology assistant**, not a clinical assistant. LoRA fine-tune of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on a curated corpus of dental-AI research papers from the IntelliDent / Polytechnique Montréal group — covering tooth segmentation (MeshSegNet, iMeshSegNet, MC-Net), preparation margin-line detection, crown generation (PointR / PointNet++), intraoral 3D scan processing, and adjacent dental imaging research.
|
||||||
|
|
||||||
|
> ## ⚠️ The slug says `slm-0m`. The model is **not** a 0M-parameter model.
|
||||||
|
>
|
||||||
|
> `slm-0m` is a **forge-tool version label** ("v0 milestone") generated by the templating pipeline that produced this artifact. It does **not** describe the model size.
|
||||||
|
>
|
||||||
|
> **Actual size:**
|
||||||
|
> - Base: **Qwen/Qwen2.5-7B-Instruct** — 7.62 B parameters
|
||||||
|
> - Trainable LoRA adapter: **~20 M parameters** (≈0.26 % of the 7.62 B base; 0.46 % of the 4.37 B post-quantization count)
|
||||||
|
>
|
||||||
|
> Future milestones will use clearer slugs (`v0`, `v0.1`, `v1`).
|
||||||
|
|
||||||
|
## ✅ What this model is for / 🚫 what it is not for
|
||||||
|
|
||||||
|
| ✅ In-scope | 🚫 Out-of-scope (model abstains) |
|
||||||
|
|---|---|
|
||||||
|
| MeshSegNet / iMeshSegNet / MC-Net architectures | Clinical questions (caries, hygiene, treatment, diagnosis) |
|
||||||
|
| Margin-line detection methods | Generic dental Qs a patient would ask |
|
||||||
|
| Crown-generation pipelines (PointR, PointNet++, GAN completion) | Other specialties: orthodontics, periodontics, implantology, oral surgery, oral cancer, TMJ, cosmetic dentistry |
|
||||||
|
| Intraoral 3D scan acquisition + decimation | Methods outside the IntelliDent corpus |
|
||||||
|
| Method comparison / methodology summaries in an academic register | Drug design, dental insurance, clinical recommendations |
|
||||||
|
|
||||||
|
**Abstention contract.** When asked anything out-of-scope, the model is instructed to answer exactly:
|
||||||
|
|
||||||
|
> "This question is outside the dental-AI research-methodology corpus this model was trained on. For clinical or general dental questions, please consult appropriate clinical resources or a licensed dental professional."
|
||||||
|
|
||||||
|
The abstention contract is enforced via the system prompt baked into the shipped [`Modelfile`](./Modelfile). **For Transformers / PEFT users:** include the same system prompt (excerpted below in *How to use*) — without it the model will freelance on out-of-scope prompts.
|
||||||
|
|
||||||
|
## What's in this repo
|
||||||
|
|
||||||
|
| File / dir | Purpose | Size |
|
||||||
|
|---|---|---|
|
||||||
|
| `adapter_model.safetensors` + `adapter_config.json` | LoRA adapter (r=32, α=64) — apply on top of `Qwen/Qwen2.5-7B-Instruct` via PEFT | ~80 MB |
|
||||||
|
| `gguf/model-Q4_K_M.gguf` | Merged + quantized 4-bit GGUF — best size/quality trade-off; runs on a modern laptop | ~4.7 GB |
|
||||||
|
| `gguf/model-Q8_0.gguf` | Merged + quantized 8-bit GGUF — near-lossless | ~8.1 GB |
|
||||||
|
| `Modelfile` | Ollama Modelfile pre-configured with system prompt + sampling defaults (see below) | — |
|
||||||
|
| `tokenizer.json` / `vocab.json` / `merges.txt` / `chat_template.jinja` | Qwen2 tokenizer + ChatML chat template | — |
|
||||||
|
| `LICENSE` | Apache-2.0 (inherited from base) | — |
|
||||||
|
|
||||||
|
## Training
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **Base model** | `Qwen/Qwen2.5-7B-Instruct` (Apache-2.0) |
|
||||||
|
| **Regime** | QLoRA — base loaded in 4-bit NF4 (bitsandbytes), LoRA adapter trained in bf16 |
|
||||||
|
| **LoRA config** | rank 32, alpha 64, dropout 0.05; targets `q_proj, k_proj, v_proj, o_proj` |
|
||||||
|
| **Trainable params** | 20,185,088 (0.46 % of 4.37 B post-quantization) |
|
||||||
|
| **Sequence length** | 2048 tokens |
|
||||||
|
| **Effective batch** | 8 (micro-batch 2 × grad-accum 4) |
|
||||||
|
| **Steps** | 900 (~2.94 epochs over 2,209 train examples) |
|
||||||
|
| **Learning rate** | 1e-4, cosine schedule, gradient checkpointing on |
|
||||||
|
| **Hardware** | AWS `g5.2xlarge` — 1× NVIDIA A10G (24 GB), 8 vCPU, 32 GB RAM, ca-central-1 |
|
||||||
|
| **Train wall-clock** | 5 h 53 min |
|
||||||
|
| **Total forge wall-clock** | 9 h 03 min (training + provisioning + repeated phase reruns from in-flight bug fixes) |
|
||||||
|
| **AWS spend** | $11.62 (on-demand g5.2xlarge × 8 h actual EC2 + Claude API for synth + plan-fit grading) |
|
||||||
|
|
||||||
|
## Training data
|
||||||
|
|
||||||
|
A 320-document corpus of dental-AI research artifacts (PDFs of theses, journal papers, conference proceedings: PolyMtl, MICCAI, SPIE, ISBI, JBHI, JMI, IEEE-EMBC, MEDIA, JDentistry, Computers in Biology and Medicine, plus internal IVADO presentations).
|
||||||
|
|
||||||
|
> **Scope of v0's corpus (important for setting expectations).**
|
||||||
|
> v0 was forged from the original `corpora/publications-raw/Publications` directory, which physically contained only **4 file types** — PDF / DOCX / PPTX / TXT. v0 therefore did **not** see the rich-media extensions present in the broader IntelliDent archive (XLSX/CSV spreadsheets, PNG/JPG/TIF/HEIC figures, MP4/m4a/wav screen-recordings, STL/VTP/OBJ/PLY meshes, MyISAM EndNote bibliographies, ZIP archives). Those are queued for **v1** — see [Roadmap](#roadmap).
|
||||||
|
|
||||||
|
| Stage | Output |
|
||||||
|
|---|---|
|
||||||
|
| `prep` | 2,448 raw passages extracted from PDF/DOCX/PPTX/TXT (`pdf=2041, docx=163, pptx=241, txt=3`) |
|
||||||
|
| `audit` | 843 passages retained (34.4 %) after MinHash dedup, length filter, language filter, domain density check, contamination scrub. **655,562 clean tokens.** |
|
||||||
|
| `synth` | 2,514 Q/A pairs synthesized via Claude Haiku 4.5 (factual / mechanism / clinical Q types) |
|
||||||
|
| `synth-filter` | 2,455 Q/A pairs after schema + quality filter |
|
||||||
|
| `shape` | 90/5/5 split → 2,209 train / 122 val / 124 test (deterministic shuffle seeded on forge id) |
|
||||||
|
|
||||||
|
The corpus is heavily concentrated on:
|
||||||
|
- 3D mesh segmentation of dental arches (MeshSegNet, iMeshSegNet, PointNet++)
|
||||||
|
- Preparation margin-line detection (regression + classification approaches)
|
||||||
|
- AI-driven crown generation (PointR shell prediction, GAN-based completion)
|
||||||
|
- Dental scan acquisition + decimation pipelines
|
||||||
|
|
||||||
|
## Plan-fit gate (pre-spend validation)
|
||||||
|
|
||||||
|
Before any GPU spend, the forge ran a 7-axis Q/A grading gate via Claude Sonnet 4.6:
|
||||||
|
|
||||||
|
| Axis | Result |
|
||||||
|
|---|---|
|
||||||
|
| In-domain classification (Haiku) | **100 %** in domain (threshold: 95 %) |
|
||||||
|
| Subdomain coverage | passed |
|
||||||
|
| Q/A factual + appropriate + grounded + concise score | mean **4.135 / 5** (threshold: 4.0); min individual **3.0+** |
|
||||||
|
| Q/A type diversity | max single type 40 % (threshold: 50 %) |
|
||||||
|
| Hyperparameter sanity | passed (rank, epochs, LR, seq-len in safe envelopes) |
|
||||||
|
| Training format roundtrip | passed (ChatML round-trips through tokenizer) |
|
||||||
|
| Budget headroom | $189.90 remaining of $200 cap (synth phase came in 94.4 % under its estimate → much cheaper than planned) |
|
||||||
|
|
||||||
|
## Evaluation
|
||||||
|
|
||||||
|
Post-training perplexity / sample-generation eval **was skipped on this run** due to a bug in the forge's eval phase (loaded base + adapter in fp32 → OOM on the 32 GB instance). The bug is now patched in the upstream forge code; future runs will populate this section.
|
||||||
|
|
||||||
|
**Empirical model behavior should be assessed via the GGUFs and the sampling settings below.** Hand-evaluation showed two failure modes that the v0 patch (this artifact, 2026-04-27) targets directly:
|
||||||
|
|
||||||
|
1. **Out-of-scope confabulation** — questions like "What causes cavities?" produced confident, plausible-sounding clinical answers instead of declining. **Root cause:** v0 was shipped with a placeholder system prompt. **Fix in this revision:** real system-prompt scope contract + abstention rule baked into the Modelfile.
|
||||||
|
2. **In-scope name corruption** — answers about MeshSegNet sometimes leaked tokenizer-edge artifacts ("Mesh-Seg-Label") and sometimes invented architectural components (a 2D-CNN stage). **Root cause:** sampling was tuned for fluency (T=0.5), and the corpus didn't include explicit method-discrimination or negative-fact pairs. **Partial fix in this revision:** sampling tightened (T=0.2, top_p=0.85, top_k=20, min_p=0.05) which kills the name-corruption class. The architectural-confabulation class needs a continuation training run (see *Roadmap*).
|
||||||
|
|
||||||
|
## Recommended sampling settings
|
||||||
|
|
||||||
|
For factual recall on this narrow corpus (all values pre-baked in the [`Modelfile`](./Modelfile)):
|
||||||
|
|
||||||
|
| param | value | reason |
|
||||||
|
|---|---|---|
|
||||||
|
| `temperature` | **0.2** | factual recall, kills tokenizer-edge corruption like `Mesh-Seg-Label` |
|
||||||
|
| `top_p` | **0.85** | tighter tail than v0's 0.9 |
|
||||||
|
| `top_k` | **20** | tighter tail than v0's 40 |
|
||||||
|
| `min_p` | **0.05** | filters noise tokens |
|
||||||
|
| `repeat_penalty` | 1.18 | kills paragraph loops |
|
||||||
|
| `repeat_last_n` | 256 | window for the penalty |
|
||||||
|
| `max_tokens` | 320 | keep responses short |
|
||||||
|
|
||||||
|
**Stop strings** (LM Studio "Stop strings" field, or llama-cli `--reverse-prompt`):
|
||||||
|
|
||||||
|
```
|
||||||
|
<|im_end|>
|
||||||
|
<|im_start|>
|
||||||
|
<|endoftext|>
|
||||||
|
ttiuser
|
||||||
|
ttiassistant
|
||||||
|
```
|
||||||
|
|
||||||
|
## How to use
|
||||||
|
|
||||||
|
> **Always pass a scope-enforcing system prompt.** The Modelfile and Space app bake one in. If you're calling the model via Transformers / PEFT directly, paste the [system prompt block](#system-prompt-for-transformerspeft-users) below into your `apply_chat_template` call.
|
||||||
|
|
||||||
|
### Load the LoRA adapter on top of Qwen2.5-7B-Instruct (Transformers + PEFT)
|
||||||
|
|
||||||
|
```python
|
||||||
|
import torch
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
from peft import PeftModel
|
||||||
|
|
||||||
|
REPO = "Nexless/dental-ai-research-slm-0m-20260425-3845"
|
||||||
|
BASE = "Qwen/Qwen2.5-7B-Instruct"
|
||||||
|
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(REPO)
|
||||||
|
base = AutoModelForCausalLM.from_pretrained(
|
||||||
|
BASE,
|
||||||
|
torch_dtype=torch.bfloat16, # required — fp32 won't fit on a 24 GB GPU
|
||||||
|
device_map="auto",
|
||||||
|
)
|
||||||
|
model = PeftModel.from_pretrained(base, REPO)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
SYSTEM_PROMPT = """You are a research-methodology assistant trained on the IntelliDent / Polytechnique Montréal research group's published dental-AI papers. ...
|
||||||
|
(see full text under 'System prompt for Transformers/PEFT users' below)"""
|
||||||
|
|
||||||
|
prompt = "How does iMeshSegNet differ from MeshSegNet for dental arch segmentation?"
|
||||||
|
inputs = tokenizer.apply_chat_template(
|
||||||
|
[
|
||||||
|
{"role": "system", "content": SYSTEM_PROMPT},
|
||||||
|
{"role": "user", "content": prompt},
|
||||||
|
],
|
||||||
|
return_tensors="pt", add_generation_prompt=True,
|
||||||
|
).to(model.device)
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
out = model.generate(
|
||||||
|
inputs,
|
||||||
|
max_new_tokens=320,
|
||||||
|
temperature=0.2,
|
||||||
|
top_p=0.85,
|
||||||
|
top_k=20,
|
||||||
|
repetition_penalty=1.18,
|
||||||
|
do_sample=True,
|
||||||
|
)
|
||||||
|
print(tokenizer.decode(out[0][inputs.shape[1]:], skip_special_tokens=True))
|
||||||
|
```
|
||||||
|
|
||||||
|
### Use the GGUF in Ollama
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -L -O https://huggingface.co/Nexless/dental-ai-research-slm-0m-20260425-3845/resolve/main/Modelfile
|
||||||
|
ollama create dental-research -f Modelfile
|
||||||
|
ollama run dental-research
|
||||||
|
```
|
||||||
|
|
||||||
|
The shipped [`Modelfile`](./Modelfile) already encodes the system prompt + sampling defaults + stop strings.
|
||||||
|
|
||||||
|
### Use the GGUF in LM Studio / llama.cpp / text-generation-webui
|
||||||
|
|
||||||
|
Download `gguf/model-Q4_K_M.gguf` (4.7 GB, recommended for laptop CPUs) or `gguf/model-Q8_0.gguf` (8.1 GB, higher quality on machines with 16+ GB RAM). In the UI's advanced configuration panel:
|
||||||
|
|
||||||
|
1. Paste the system prompt from [System prompt for Transformers/PEFT users](#system-prompt-for-transformerspeft-users) below.
|
||||||
|
2. Set the sampling values from the [recommended sampling settings table](#recommended-sampling-settings).
|
||||||
|
3. Add the stop strings.
|
||||||
|
|
||||||
|
### System prompt for Transformers/PEFT users
|
||||||
|
|
||||||
|
```text
|
||||||
|
You are a research-methodology assistant trained on the IntelliDent / Polytechnique Montréal research group's published dental-AI papers. Your scope is strictly:
|
||||||
|
|
||||||
|
IN-SCOPE
|
||||||
|
- Methods from the IntelliDent corpus: MeshSegNet, iMeshSegNet, MC-Net, margin-line detection, crown-generation pipelines (PointR / PointNet++ / GAN completion), intraoral-scan acquisition + decimation, dental arch segmentation
|
||||||
|
- Comparison and explanation of these methods (architecture, training data, loss functions, post-processing)
|
||||||
|
- Methodology summaries written in an academic register
|
||||||
|
|
||||||
|
OUT-OF-SCOPE
|
||||||
|
- Clinical or medical questions (caries causes, hygiene, treatment, diagnosis)
|
||||||
|
- Methods outside the IntelliDent corpus
|
||||||
|
- Generic dental questions a patient would ask
|
||||||
|
- Other dental specialties (orthodontics, periodontics, endodontics, implantology, oral surgery, oral cancer, cosmetic dentistry, TMJ, dental insurance)
|
||||||
|
|
||||||
|
ABSTENTION CONTRACT
|
||||||
|
For ANY out-of-scope question, respond exactly:
|
||||||
|
"This question is outside the dental-AI research-methodology corpus this model was trained on. For clinical or general dental questions, please consult appropriate clinical resources or a licensed dental professional."
|
||||||
|
|
||||||
|
WHEN ANSWERING IN-SCOPE QUESTIONS
|
||||||
|
- Name methods exactly: "MeshSegNet" (not "Mesh-Seg-Label" or "MeshSegLabel")
|
||||||
|
- MeshSegNet uses graph-constrained learning modules (GLMs) with multi-scale adjacency matrices, performs CELL-WISE classification on mesh triangles (NOT vertex-wise), and operates directly in 3D space (NO 2D-CNN component)
|
||||||
|
- iMeshSegNet replaces GLMs with EdgeConv for efficiency
|
||||||
|
- MC-Net targets mesh completion, not segmentation
|
||||||
|
- If uncertain about a specific detail, say "the corpus does not specify this clearly" rather than fabricating
|
||||||
|
|
||||||
|
Example 1 (in-scope):
|
||||||
|
Q: What is MeshSegNet?
|
||||||
|
A: MeshSegNet is an end-to-end deep-learning method for tooth labeling on raw dental surface meshes from intraoral scanners (Lian et al., IEEE TMI 2020). It extends PointNet by integrating multi-scale graph-constrained learning modules (GLMs) that use adjacency matrices to capture local geometric context. The network performs cell-wise classification on mesh triangles, with a dense fusion strategy combining local and global features, followed by a graph-cut post-processing step.
|
||||||
|
|
||||||
|
Example 2 (out-of-scope):
|
||||||
|
Q: What causes cavities?
|
||||||
|
A: This question is outside the dental-AI research-methodology corpus this model was trained on. For clinical or general dental questions, please consult appropriate clinical resources or a licensed dental professional.
|
||||||
|
```
|
||||||
|
|
||||||
|
## Limitations & caveats
|
||||||
|
|
||||||
|
- **Narrow domain.** Out-of-domain generic prompts will degrade vs. base. Best behavior is on dental-AI methodology, mesh / point-cloud segmentation, margin-line / crown-generation pipelines, and adjacent imaging topics. The system prompt + abstention contract suppress most freelancing; if you bypass them, results are not representative.
|
||||||
|
- **Small training set.** ~655K clean tokens / 2,455 Q/A pairs is on the lower end for 7B SFT. A larger corpus would improve recall of paper-specific facts.
|
||||||
|
- **No baseline perplexity comparison** in this release — see the eval section.
|
||||||
|
- **Method-discrimination + negative-fact gaps.** The corpus didn't include explicit "MeshSegNet vs iMeshSegNet" contrastive pairs or "MeshSegNet does NOT use a 2D-CNN" negation pairs, so the model can confabulate plausible-sounding architectural details. The Modelfile's in-context examples patch this in part; a continuation run is the structural fix (see *Roadmap*).
|
||||||
|
- **Live demo Space currently broken on free tier.** The auto-published [Gradio Space](https://huggingface.co/spaces/Nexless/dental-ai-research-slm-0m-20260425-3845-demo) tries to load the full 7B in HuggingFace Spaces' free CPU tier (16 GB RAM) and OOMs. Use the GGUF locally instead, or fork the Space onto a paid GPU runtime.
|
||||||
|
- **Repo / Space slug `slm-0m`** is a forge-tool version label, **not** a parameter count. Future revisions will use clearer slugs.
|
||||||
|
|
||||||
|
## Roadmap
|
||||||
|
|
||||||
|
This is **v0**. Two follow-up artifacts are planned:
|
||||||
|
|
||||||
|
### v0.1 — continuation training (~3–4 h on A10G, ~$5)
|
||||||
|
|
||||||
|
Add three synth buckets to the existing Q/A set and continue-train the LoRA at LR=5e-5 for one epoch:
|
||||||
|
|
||||||
|
| Bucket | Pairs | Purpose |
|
||||||
|
|---|---|---|
|
||||||
|
| **Abstention pairs** | ~250 | Out-of-scope questions paired with the abstention response (3-4 phrasing variants), covering generic hygiene, orthodontics, periodontics, drug design, mesh decimation, dental insurance, cosmetic dentistry, oral cancer, TMJ, implants. Adds learned abstention behavior on top of the system-prompt enforcement. |
|
||||||
|
| **Method discrimination pairs** | ~150 | "What's the difference between [method A] and [method B]?" with explicit contrastive answers naming the actual differentiator. Pairs to cover: MeshSegNet vs iMeshSegNet (GLM vs EdgeConv), vs PointNet (mesh vs raw point cloud), vs TSGCNet (single vs dual stream), vs DGCNN; MC-Net vs MeshSegNet (completion vs segmentation). |
|
||||||
|
| **Negative-fact pairs** | ~100 | Explicit corrections: "Does MeshSegNet use 2D CNNs? No — MeshSegNet operates directly in 3D space on mesh cells." Negation training is the underused trick — LLMs hallucinate confidently because they've never seen explicit corrections. |
|
||||||
|
|
||||||
|
### v1 — full re-forge (expanded corpus, all file types)
|
||||||
|
|
||||||
|
v1 moves from `corpora/publications-raw/Publications` (4 file types, 320 docs) to **`corpora2/extracted/Publications`** — **1,547 files spanning 18 extensions** routed through all 19 SLM-Forge prep plugins. New file-type coverage relative to v0:
|
||||||
|
|
||||||
|
| Extension family | Plugin | What v0 missed |
|
||||||
|
|---|---|---|
|
||||||
|
| **PDF / DOCX / PPTX / TXT** | `pdf`, `docx_plugin`, `pptx_plugin`, `text_simple` | (already covered in v0) |
|
||||||
|
| **XLSX / CSV** | `tabular` | data tables, references lists in spreadsheets |
|
||||||
|
| **EPUB** | `epub` | textbook chapters |
|
||||||
|
| **Notebooks (.ipynb)** | `notebook` | analysis code + markdown narratives |
|
||||||
|
| **Code (.py / .js / .ts / .cpp / etc.)** | `code` | training scripts, model definitions |
|
||||||
|
| **PNG / JPG / TIF / HEIC** | `ocr` (disabled in v1: `FORGE_DISABLE_OCR=1`) | (figures yielded low-value OCR text — explicitly skipped to save tokens + Claude budget) |
|
||||||
|
| **MP4 / MKV / MOV / WEBM** | `video` (whitelisted: ≥120 s + audio stream required) | narrated method walkthroughs (~10 of 171 clips have value; rest are silent MeshLab screen-recordings) |
|
||||||
|
| **m4a / wav / mp3** | `audio` (faster-whisper local CPU transcribe) | recorded talks, voice memos |
|
||||||
|
| **STL / PLY / OBJ / VTP** | `mesh_metadata` | 3D mesh metadata (vertex/face counts, bounding boxes) |
|
||||||
|
| **DICOM** | `dicom` | medical imaging headers |
|
||||||
|
| **MyISAM (.frm/.myi/.myd/.ibd)** | `mysql_revive` | EndNote bibliographic backups (paper abstracts → high signal) |
|
||||||
|
| **EML / MBOX** | `email_plugin` | research correspondence |
|
||||||
|
| **GeoTIFF / SHP** | `geo` | (not present here, future-proofing) |
|
||||||
|
| **HDF5 / NetCDF** | `scientific` | dataset archives |
|
||||||
|
| **ZIP / TAR / RAR** | `archive` | recursive extraction (RAR support via `unrar` system binary) |
|
||||||
|
| **Binary (default)** | `binary_metadata` | catch-all metadata extraction |
|
||||||
|
|
||||||
|
Plus structural changes targeting the failure modes documented in *Evaluation*:
|
||||||
|
- **Smaller base, lower rank.** 7B + r=32 + 1.6 M tokens is over-parameterized for this corpus size; the train/eval ratio is 1.61× which is the overfitting signature. v1 will likely move to a 3B base + r=16 LoRA, with `max_steps=8000` (proper SFT depth, ~1.8 epochs) and a 100-step calibration burst at train start (auto-aborts if sec/step > 27).
|
||||||
|
- **New synth buckets baked in.** The v0.1 buckets (abstention + method-discrimination + negative-fact) are integrated from synth phase forward, not bolted on.
|
||||||
|
- **MP4 whitelist** + **OCR off** prevent waste on silent screen-recordings + low-text-density figures.
|
||||||
|
|
||||||
|
## Reproducibility
|
||||||
|
|
||||||
|
Forge run id: `v2-20260424-131000-3845`. Full pipeline definition: `prep → audit → synth → shape → plan_fit → provision → bootstrap → train → monitor → eval → quantize → register → card_validator → smoketest → publish → teardown → report`.
|
||||||
|
|
||||||
|
## Citation
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
@misc{dental_ai_research_slm_2026,
|
||||||
|
title = {dental-ai-research-slm: a Qwen2.5-7B-Instruct LoRA fine-tuned on dental-AI research papers},
|
||||||
|
author = {Nexless},
|
||||||
|
year = {2026},
|
||||||
|
howpublished = {\url{https://huggingface.co/Nexless/dental-ai-research-slm-0m-20260425-3845}}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ⚠️ Disclaimer — Proof of Concept
|
||||||
|
|
||||||
|
This artifact is a **proof of concept** of **semi-autonomous skills running inside the Claude Code TUI**, developed by **Nexless**. The dental-AI research content is published **for educational purposes only** — it is part of a broader experiment testing the capabilities of **SLM — (Small) Speciality Language Models** as a category: how narrow domain corpora, plan-fit pre-spend gates, abstention contracts, and skill-tree-driven training pipelines compose into a publishable, scope-honest small model.
|
||||||
|
|
||||||
|
**Not for clinical use.** Not a substitute for licensed dental or medical advice. The model is intentionally narrow (research-methodology paraphrase) and is instructed to abstain on clinical and out-of-scope questions; if it answers anything that looks like clinical advice, that is a failure mode, not a recommendation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
*Trained with [SLM-Forge](https://github.com/Dshamir/slm-forge) — a skill-tree pipeline for scoping, training, evaluating, quantizing, and publishing small-to-mid-size domain language models.*
|
||||||
|
|
||||||
|
*Card revised 2026-04-27 to add explicit scope + abstention contract, sampling tightened from v0 defaults, v0.1/v1 roadmap, and PoC disclaimer.*
|
||||||
45
adapter_config.json
Normal file
45
adapter_config.json
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
{
|
||||||
|
"alora_invocation_tokens": null,
|
||||||
|
"alpha_pattern": {},
|
||||||
|
"arrow_config": null,
|
||||||
|
"auto_mapping": null,
|
||||||
|
"base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
|
||||||
|
"bias": "none",
|
||||||
|
"corda_config": null,
|
||||||
|
"ensure_weight_tying": false,
|
||||||
|
"eva_config": null,
|
||||||
|
"exclude_modules": null,
|
||||||
|
"fan_in_fan_out": false,
|
||||||
|
"inference_mode": true,
|
||||||
|
"init_lora_weights": true,
|
||||||
|
"layer_replication": null,
|
||||||
|
"layers_pattern": null,
|
||||||
|
"layers_to_transform": null,
|
||||||
|
"loftq_config": {},
|
||||||
|
"lora_alpha": 64,
|
||||||
|
"lora_bias": false,
|
||||||
|
"lora_dropout": 0.05,
|
||||||
|
"lora_ga_config": null,
|
||||||
|
"megatron_config": null,
|
||||||
|
"megatron_core": "megatron.core",
|
||||||
|
"modules_to_save": null,
|
||||||
|
"peft_type": "LORA",
|
||||||
|
"peft_version": "0.19.1",
|
||||||
|
"qalora_group_size": 16,
|
||||||
|
"r": 32,
|
||||||
|
"rank_pattern": {},
|
||||||
|
"revision": null,
|
||||||
|
"target_modules": [
|
||||||
|
"o_proj",
|
||||||
|
"q_proj",
|
||||||
|
"k_proj",
|
||||||
|
"v_proj"
|
||||||
|
],
|
||||||
|
"target_parameters": null,
|
||||||
|
"task_type": "CAUSAL_LM",
|
||||||
|
"trainable_token_indices": null,
|
||||||
|
"use_bdlora": null,
|
||||||
|
"use_dora": false,
|
||||||
|
"use_qalora": false,
|
||||||
|
"use_rslora": false
|
||||||
|
}
|
||||||
3
adapter_model.safetensors
Normal file
3
adapter_model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:d5b436425de2dac8688e92a90c7a132174ee54f33fb64fa22b41a81eb29e8ba0
|
||||||
|
size 80770432
|
||||||
24
added_tokens.json
Normal file
24
added_tokens.json
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
{
|
||||||
|
"</tool_call>": 151658,
|
||||||
|
"<tool_call>": 151657,
|
||||||
|
"<|box_end|>": 151649,
|
||||||
|
"<|box_start|>": 151648,
|
||||||
|
"<|endoftext|>": 151643,
|
||||||
|
"<|file_sep|>": 151664,
|
||||||
|
"<|fim_middle|>": 151660,
|
||||||
|
"<|fim_pad|>": 151662,
|
||||||
|
"<|fim_prefix|>": 151659,
|
||||||
|
"<|fim_suffix|>": 151661,
|
||||||
|
"<|im_end|>": 151645,
|
||||||
|
"<|im_start|>": 151644,
|
||||||
|
"<|image_pad|>": 151655,
|
||||||
|
"<|object_ref_end|>": 151647,
|
||||||
|
"<|object_ref_start|>": 151646,
|
||||||
|
"<|quad_end|>": 151651,
|
||||||
|
"<|quad_start|>": 151650,
|
||||||
|
"<|repo_name|>": 151663,
|
||||||
|
"<|video_pad|>": 151656,
|
||||||
|
"<|vision_end|>": 151653,
|
||||||
|
"<|vision_pad|>": 151654,
|
||||||
|
"<|vision_start|>": 151652
|
||||||
|
}
|
||||||
54
chat_template.jinja
Normal file
54
chat_template.jinja
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
{%- if tools %}
|
||||||
|
{{- '<|im_start|>system\n' }}
|
||||||
|
{%- if messages[0]['role'] == 'system' %}
|
||||||
|
{{- messages[0]['content'] }}
|
||||||
|
{%- else %}
|
||||||
|
{{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
||||||
|
{%- for tool in tools %}
|
||||||
|
{{- "\n" }}
|
||||||
|
{{- tool | tojson }}
|
||||||
|
{%- endfor %}
|
||||||
|
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
||||||
|
{%- else %}
|
||||||
|
{%- if messages[0]['role'] == 'system' %}
|
||||||
|
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
|
||||||
|
{%- else %}
|
||||||
|
{{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- for message in messages %}
|
||||||
|
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
|
||||||
|
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
||||||
|
{%- elif message.role == "assistant" %}
|
||||||
|
{{- '<|im_start|>' + message.role }}
|
||||||
|
{%- if message.content %}
|
||||||
|
{{- '\n' + message.content }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- for tool_call in message.tool_calls %}
|
||||||
|
{%- if tool_call.function is defined %}
|
||||||
|
{%- set tool_call = tool_call.function %}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '\n<tool_call>\n{"name": "' }}
|
||||||
|
{{- tool_call.name }}
|
||||||
|
{{- '", "arguments": ' }}
|
||||||
|
{{- tool_call.arguments | tojson }}
|
||||||
|
{{- '}\n</tool_call>' }}
|
||||||
|
{%- endfor %}
|
||||||
|
{{- '<|im_end|>\n' }}
|
||||||
|
{%- elif message.role == "tool" %}
|
||||||
|
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
|
||||||
|
{{- '<|im_start|>user' }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '\n<tool_response>\n' }}
|
||||||
|
{{- message.content }}
|
||||||
|
{{- '\n</tool_response>' }}
|
||||||
|
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
||||||
|
{{- '<|im_end|>\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endfor %}
|
||||||
|
{%- if add_generation_prompt %}
|
||||||
|
{{- '<|im_start|>assistant\n' }}
|
||||||
|
{%- endif %}
|
||||||
3
gguf/model-Q4_K_M.gguf
Normal file
3
gguf/model-Q4_K_M.gguf
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:d7f7bc4bc3a5599bde7012f7888e4a302a5797aa5792cc0faf12fb4deb84f95a
|
||||||
|
size 4683073440
|
||||||
3
gguf/model-Q8_0.gguf
Normal file
3
gguf/model-Q8_0.gguf
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:52f5039eb371d94b02adad1a99e804e1b85fc9bcf5ea2de4e2261a26e3586263
|
||||||
|
size 8098525088
|
||||||
151388
merges.txt
Normal file
151388
merges.txt
Normal file
File diff suppressed because it is too large
Load Diff
31
special_tokens_map.json
Normal file
31
special_tokens_map.json
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
{
|
||||||
|
"additional_special_tokens": [
|
||||||
|
"<|im_start|>",
|
||||||
|
"<|im_end|>",
|
||||||
|
"<|object_ref_start|>",
|
||||||
|
"<|object_ref_end|>",
|
||||||
|
"<|box_start|>",
|
||||||
|
"<|box_end|>",
|
||||||
|
"<|quad_start|>",
|
||||||
|
"<|quad_end|>",
|
||||||
|
"<|vision_start|>",
|
||||||
|
"<|vision_end|>",
|
||||||
|
"<|vision_pad|>",
|
||||||
|
"<|image_pad|>",
|
||||||
|
"<|video_pad|>"
|
||||||
|
],
|
||||||
|
"eos_token": {
|
||||||
|
"content": "<|im_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"pad_token": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
}
|
||||||
|
}
|
||||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:694f1174c5bdf94e2fc50796c0f1733a5a3945ff110b0dfa40ea0701cc9c9c42
|
||||||
|
size 11422176
|
||||||
207
tokenizer_config.json
Normal file
207
tokenizer_config.json
Normal file
@@ -0,0 +1,207 @@
|
|||||||
|
{
|
||||||
|
"add_bos_token": false,
|
||||||
|
"add_prefix_space": false,
|
||||||
|
"added_tokens_decoder": {
|
||||||
|
"151643": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151644": {
|
||||||
|
"content": "<|im_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151645": {
|
||||||
|
"content": "<|im_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151646": {
|
||||||
|
"content": "<|object_ref_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151647": {
|
||||||
|
"content": "<|object_ref_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151648": {
|
||||||
|
"content": "<|box_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151649": {
|
||||||
|
"content": "<|box_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151650": {
|
||||||
|
"content": "<|quad_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151651": {
|
||||||
|
"content": "<|quad_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151652": {
|
||||||
|
"content": "<|vision_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151653": {
|
||||||
|
"content": "<|vision_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151654": {
|
||||||
|
"content": "<|vision_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151655": {
|
||||||
|
"content": "<|image_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151656": {
|
||||||
|
"content": "<|video_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151657": {
|
||||||
|
"content": "<tool_call>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151658": {
|
||||||
|
"content": "</tool_call>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151659": {
|
||||||
|
"content": "<|fim_prefix|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151660": {
|
||||||
|
"content": "<|fim_middle|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151661": {
|
||||||
|
"content": "<|fim_suffix|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151662": {
|
||||||
|
"content": "<|fim_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151663": {
|
||||||
|
"content": "<|repo_name|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151664": {
|
||||||
|
"content": "<|file_sep|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additional_special_tokens": [
|
||||||
|
"<|im_start|>",
|
||||||
|
"<|im_end|>",
|
||||||
|
"<|object_ref_start|>",
|
||||||
|
"<|object_ref_end|>",
|
||||||
|
"<|box_start|>",
|
||||||
|
"<|box_end|>",
|
||||||
|
"<|quad_start|>",
|
||||||
|
"<|quad_end|>",
|
||||||
|
"<|vision_start|>",
|
||||||
|
"<|vision_end|>",
|
||||||
|
"<|vision_pad|>",
|
||||||
|
"<|image_pad|>",
|
||||||
|
"<|video_pad|>"
|
||||||
|
],
|
||||||
|
"bos_token": null,
|
||||||
|
"clean_up_tokenization_spaces": false,
|
||||||
|
"eos_token": "<|im_end|>",
|
||||||
|
"errors": "replace",
|
||||||
|
"extra_special_tokens": {},
|
||||||
|
"model_max_length": 131072,
|
||||||
|
"pad_token": "<|endoftext|>",
|
||||||
|
"split_special_tokens": false,
|
||||||
|
"tokenizer_class": "Qwen2Tokenizer",
|
||||||
|
"unk_token": null
|
||||||
|
}
|
||||||
1
vocab.json
Normal file
1
vocab.json
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user