初始化项目，由ModelHub XC社区提供模型

Model: kojima-lab/molcrawl-molecule-nat-lang-mol-instructions-gpt2-small Source: Original Platform
2026-05-30 04:10:25 +08:00
commit 3da01878b8
13 changed files with 300724 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
--- a/README.md
+++ b/README.md
@@ -0,0 +1,71 @@
+---
+license: apache-2.0
+tags:
+- pytorch
+- gpt2
+- molecule-nl
+pipeline_tag: text-generation
+---
+
+# molcrawl-molecule-nat-lang-mol-instructions-gpt2-small
+
+## Model Description
+
+GPT-2 small (124M parameters) fine-tuned on molecule-oriented instruction data from [Mol-Instructions](https://huggingface.co/datasets/zjunlp/Mol-Instructions), starting from the `molcrawl-molecule-nat-lang-gpt2-small` pre-trained model.
+
+## Datasets
+
+- **Mol-Instructions**: [https://huggingface.co/datasets/zjunlp/Mol-Instructions](https://huggingface.co/datasets/zjunlp/Mol-Instructions) (Fine-tuning dataset)
+
+- **Model Type**: gpt2
+- **Data Type**: Molecule-NL
+- **Training Date**: 2026-04-24
+
+## Usage
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+model = AutoModelForCausalLM.from_pretrained("kojima-lab/molcrawl-molecule-nat-lang-mol-instructions-gpt2-small")
+tokenizer = AutoTokenizer.from_pretrained("kojima-lab/molcrawl-molecule-nat-lang-mol-instructions-gpt2-small")
+
+# Generate molecule-related text
+prompt = "The compound with SMILES CC(=O)Oc1ccccc1C(=O)O represents aspirin, which"
+inputs = tokenizer(prompt, return_tensors="pt")
+with torch.no_grad():
+    output_ids = model.generate(
+        **inputs,
+        max_new_tokens=100,
+        do_sample=True,
+        temperature=0.8,
+        eos_token_id=None,  # disable EOS-based early stopping (config.json sets eos_token_id=2)
+        pad_token_id=0,
+    )
+print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
+
+```
+
+## Source Code
+
+Training pipeline, configuration files, and data preparation scripts are
+available in the MolCrawl GitHub repository:
+[https://github.com/mmai-framework-lab/MolCrawl](https://github.com/mmai-framework-lab/MolCrawl)
+
+## License
+
+This model is released under the Apache-2.0 license.
+
+## Citation
+
+If you use this model, please cite:
+
+```bibtex
+@misc{molcrawl_molecule_nat_lang_mol_instructions_gpt2_small,
+  title={molcrawl-molecule-nat-lang-mol-instructions-gpt2-small},
+  author={{RIKEN}},
+  year={2026},
+  publisher={{Hugging Face}},
+  url={{https://huggingface.co/kojima-lab/molcrawl-molecule-nat-lang-mol-instructions-gpt2-small}}
+}
+```
--- a/TOKENIZER_NOTE.md
+++ b/TOKENIZER_NOTE.md
@@ -0,0 +1,12 @@
+# Tokenizer Note
+
+This model was trained with an internal hash-based tokenizer (vocab_size=50002).
+The tokenizer is not saved in standard HuggingFace format.
+
+For inference, use a tokenizer with vocab_size=50002 or the CodeLlama tokenizer
+(`codellama/CodeLlama-7b-hf`) as the intended base.
+
+Special token IDs:
+- `<pad>`: 0
+- `<eos>`: 2
+- `[/INST]` sequence: [518, 29914, 25580, 29162]
--- a/config.json
+++ b/config.json
@@ -0,0 +1,35 @@
+{
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "model_type": "gpt2",
+  "vocab_size": 50002,
+  "n_positions": 1024,
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_layer": 12,
+  "n_head": 12,
+  "n_inner": 3072,
+  "activation_function": "gelu_new",
+  "resid_pdrop": 0.0,
+  "embd_pdrop": 0.0,
+  "attn_pdrop": 0.0,
+  "layer_norm_epsilon": 1e-05,
+  "initializer_range": 0.02,
+  "use_cache": true,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "transformers_version": "4.0.0",
+  "_name_or_path": "riken-gpt2",
+  "_riken_model_args": {
+    "n_layer": 12,
+    "n_head": 12,
+    "n_embd": 768,
+    "block_size": 1024,
+    "bias": false,
+    "vocab_size": 50257,
+    "dropout": 0.0
+  },
+  "_riken_bias": false,
+  "pad_token_id": 0
+}
--- a/merges.txt
+++ b/merges.txt
--- a/model.safetensors
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af208e13e8a7c01cd0a717cabff1e2fbb2764531e9fc6d4b55dc7add2ccd4a32
+size 496990848
--- a/pytorch_model.bin
+++ b/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97c0de28e3406ab135b70aa8c3c189d4cb74943ed68185554af5fa2fa47f9573
+size 497375976
--- a/sample_inference.py
+++ b/sample_inference.py
@@ -0,0 +1,198 @@
+"""
+Sample inference script for molcrawl-molecule-nat-lang-gpt2-small.
+
+This model is a GPT-2 small (124M params) foundation model pretrained on
+molecule-related natural language data using a standard GPT-2 BPE tokenizer
+(vocab_size=50257).
+
+Key fix over the 20260316 version:
+  - 20260316: Used MinimalTokenizer with Python hash() — non-deterministic,
+    decode() impossible, data/model mismatch.
+  - 20260325: Uses GPT2TokenizerFast (BPE) — fully deterministic, decodable.
+
+Usage:
+    # From HuggingFace Hub
+    python sample_inference.py
+
+    # From local checkpoint dir
+    MODEL_PATH=/path/to/checkpoint python sample_inference.py
+"""
+
+import os
+import sys
+
+try:
+    import torch
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+except ImportError:
+    print("ERROR: Install dependencies: pip install transformers torch")
+    sys.exit(1)
+
+# ---------------------------------------------------------------------------
+# Config
+# ---------------------------------------------------------------------------
+# Model identifier passed to from_pretrained(); the MODEL_PATH environment
+# variable overrides the default (e.g. to point at a local checkpoint dir).
+# NOTE(review): the default names the base "molcrawl-molecule-nat-lang-gpt2-small"
+# repo, not a Mol-Instructions fine-tuned checkpoint — confirm this is intended.
+MODEL_PATH = os.environ.get("MODEL_PATH", "kojima-lab/molcrawl-molecule-nat-lang-gpt2-small")
+# Use the GPU when CUDA is available, otherwise fall back to the CPU.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Prompts that test_generation() continues.
+DEMO_TEXTS = [
+    "The compound with SMILES CC(=O)O is",
+    "This molecule has a molecular weight of",
+    "The SMILES CC(=O)Oc1ccccc1C(=O)O represents aspirin, which",
+    "In drug discovery, the key property of this compound is",
+]
+
+
+# ---------------------------------------------------------------------------
+# TEST 1: Tokenizer determinism (validates 20260316 defect is resolved)
+# ---------------------------------------------------------------------------
+def test_tokenizer_determinism(tokenizer):
+    """
+    20260316 defect: MinimalTokenizer used abs(hash(token)) % 50000 + 2.
+    Python hash() is PYTHONHASHSEED-dependent -> different IDs across processes.
+    20260325 fix: GPT2TokenizerFast (BPE) -> fully deterministic.
+    """
+    print("\n[TEST 1] Tokenizer Determinism")
+    print("-" * 40)
+    text = "The SMILES CC(=O)Oc1ccccc1C(=O)O represents aspirin."
+
+    calls = [tokenizer.encode(text) for _ in range(5)]
+    all_equal = all(c == calls[0] for c in calls)
+
+    print(f"  Input :  {text!r}")
+    print(f"  IDs   :  {calls[0][:10]}...")
+    print(f"  Deterministic (5 calls identical): {'PASS ✓' if all_equal else 'FAIL ✗'}")
+    print(f"  vocab_size : {tokenizer.vocab_size}")
+    print(f"  max token ID: {max(calls[0])} (< vocab_size: {max(calls[0]) < tokenizer.vocab_size} ✓)")
+
+    # Compare with 20260316 behaviour (MinimalTokenizer with fixed seed for demo)
+    # When PYTHONHASHSEED varies: abs(hash('aspirin')) % 50000 + 2 will differ.
+    # Demonstrating the class of defect:
+
+    # Simulate two different hash seeds via salt (cannot change PYTHONHASHSEED mid-process)
+    # Instead, show the formula directly
+    tok_str = "aspirin"
+    h1 = abs(hash(tok_str)) % 50000 + 2
+    # A different Python process with different PYTHONHASHSEED would give different h1
+    print(f"\n  [Defect demo] MinimalTokenizer hash('aspirin') % 50000 + 2 = {h1}")
+    print("  [Defect demo] This value changes across Python processes (PYTHONHASHSEED=random)")
+    print(f"  [Fixed] GPT-2 BPE: 'aspirin' -> {tokenizer.encode('aspirin')} (always)")
+
+    return all_equal
+
+
+# ---------------------------------------------------------------------------
+# TEST 2: Round-trip encode → decode
+# ---------------------------------------------------------------------------
+def test_round_trip(tokenizer):
+    """Verify encode → decode produces the original text (impossible with MinimalTokenizer)."""
+    print("\n[TEST 2] Round-trip Encode → Decode")
+    print("-" * 40)
+    texts = [
+        "The SMILES CC(=O)Oc1ccccc1C(=O)O represents aspirin.",
+        "Drug discovery requires understanding molecular properties.",
+        "CC(N)C(=O)O is alanine, an amino acid.",
+    ]
+    all_pass = True
+    for text in texts:
+        ids = tokenizer.encode(text)
+        decoded = tokenizer.decode(ids, skip_special_tokens=True)
+        match = text.strip() == decoded.strip()
+        all_pass = all_pass and match
+        status = "PASS ✓" if match else "FAIL ✗"
+        print(f"  {status}  {text[:50]!r}")
+        if not match:
+            print(f"     decoded: {decoded!r}")
+    return all_pass
+
+
+# ---------------------------------------------------------------------------
+# TEST 3: Vocabulary coverage of molecule-specific tokens
+# ---------------------------------------------------------------------------
+def test_molecule_tokens(tokenizer):
+    """Check that molecule-specific strings tokenize to reasonable sequences."""
+    print("\n[TEST 3] Molecule Token Coverage")
+    print("-" * 40)
+    examples = {
+        "CC(=O)O": "acetic acid (SMILES)",
+        "c1ccccc1": "benzene ring (SMILES)",
+        "CC(=O)Oc1ccccc1C(=O)O": "aspirin (SMILES)",
+        "NH2": "amine group",
+        "molecular weight": "NL phrase",
+        "IC50": "pharmacology term",
+        "ADMET": "drug property acronym",
+    }
+    for tok_str, desc in examples.items():
+        ids = tokenizer.encode(tok_str)
+        print(f"  {desc:35s} -> {len(ids):2d} tokens  {ids[:6]}")
+
+
+# ---------------------------------------------------------------------------
+# TEST 4: Text generation
+# ---------------------------------------------------------------------------
+def test_generation(model, tokenizer):
+    """Generate continuations for molecule-related prompts."""
+    print("\n[TEST 4] Text Generation")
+    print("-" * 40)
+    model.eval()
+    for prompt in DEMO_TEXTS:
+        inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
+        with torch.no_grad():
+            out = model.generate(
+                **inputs,
+                max_new_tokens=60,
+                do_sample=True,
+                temperature=0.85,
+                top_p=0.92,
+                pad_token_id=tokenizer.eos_token_id,
+                repetition_penalty=1.1,
+            )
+        generated = tokenizer.decode(out[0], skip_special_tokens=True)
+        print(f"\n  Prompt : {prompt!r}")
+        print(f"  Output : {generated!r}")
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main():
+    print("=" * 60)
+    print("MolCrawl molecule_nat_lang GPT-2 small — Inference Demo")
+    print(f"Model : {MODEL_PATH}")
+    print(f"Device: {DEVICE}")
+    print("=" * 60)
+
+    # Load
+    print("\nLoading tokenizer...")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+    print(f"  class   : {type(tokenizer).__name__}")
+    print(f"  vocab   : {tokenizer.vocab_size}")
+
+    print("Loading model...")
+    model = AutoModelForCausalLM.from_pretrained(MODEL_PATH).to(DEVICE)
+    model.eval()
+    n_params = sum(p.numel() for p in model.parameters())
+    print(f"  params  : {n_params:,}")
+
+    # Run tests
+    r1 = test_tokenizer_determinism(tokenizer)
+    r2 = test_round_trip(tokenizer)
+    test_molecule_tokens(tokenizer)
+    test_generation(model, tokenizer)
+
+    # Summary
+    print("\n" + "=" * 60)
+    print("Summary")
+    print("=" * 60)
+    print(f"  Tokenizer determinism : {'PASS ✓' if r1 else 'FAIL ✗'}")
+    print(f"  Round-trip decode     : {'PASS ✓' if r2 else 'FAIL ✗'}")
+    print("  Text generation       : done")
+    if r1 and r2:
+        print("\n  All validation tests PASSED.")
+        print("  Tokenizer defect from 20260316 (MinimalTokenizer hash-based) is RESOLVED.")
+    else:
+        print("\n  Some tests FAILED — please check the output above.")
+    print("=" * 60)
+
+
+if __name__ == "__main__":
+    main()
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
--- a/tokenizer.json
+++ b/tokenizer.json
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -0,0 +1,19 @@
+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 1024,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}
--- a/training_args.json
+++ b/training_args.json
@@ -0,0 +1,17 @@
+{
+  "iteration": 2000,
+  "best_val_loss": 0.51076340675354,
+  "early_stopping_counter": 1,
+  "learning_rate": 1e-05,
+  "batch_size": 8,
+  "block_size": 1024,
+  "model_args": {
+    "n_layer": 12,
+    "n_head": 12,
+    "n_embd": 768,
+    "block_size": 1024,
+    "bias": false,
+    "vocab_size": 50257,
+    "dropout": 0.0
+  }
+}
--- a/vocab.json
+++ b/vocab.json