commit 2298f7fa8906f1934f261eab0b276a0a5f748c37
Author: ModelHub XC
Date: Sun Apr 26 00:21:37 2026 +0800

    Initialize project; model provided by the ModelHub XC community
    Model: CaaLM/CaaLM-v1
    Source: Original Platform

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..52373fe
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b456522
--- /dev/null
+++ b/README.md
@@ -0,0 +1,231 @@
---
license: apache-2.0
language:
- en
tags:
- code
- execution
- prediction
- language-generalization
- no-compiler
- python
- javascript
- lua
- cobol
- synthetic-languages
- transformers
- qwen2
pipeline_tag: text-generation
base_model: Qwen/Qwen2.5-1.5B
library_name: transformers
---

# CaaLM/CaaLM-v1

![CaaLM-v1 Logo](https://cdn-uploads.huggingface.co/production/uploads/670562d6ac129959c16f84d4/lsYHkWaSlewMkpgEaOJNP.png)

## What is this?

CaaLM (Code as a Language Model) is a 1.5B parameter model that predicts the output of code — without a compiler, runtime, or interpreter.

You give it code. It tells you what it would print.

The interesting part: it was never trained on a fixed set of languages. Instead, it was trained on real languages (Python, JavaScript, Lua, COBOL) alongside 200 synthetically generated fake programming languages — each with randomized syntax but consistent semantics. The goal was to teach the model what *execution* means, not what any specific language looks like.

This means it can predict the output of languages it has never seen before.
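
To make the synthetic-language idea concrete, here is a rough sketch of how such training pairs can be produced. It is an illustration only, not the actual CaaLM-v1 data generator: it invents a random surface syntax for assignment and printing, renders a tiny program in it, and computes the ground-truth output by simulating the shared semantics in plain Python.

```python
# Illustrative sketch (not the actual CaaLM-v1 data generator):
# invent a fake language, render a tiny program in it, and compute the
# ground-truth output by simulating the shared semantics in Python.
import random


def make_language(rng: random.Random) -> dict:
    """Pick random surface forms; the underlying semantics stay fixed."""
    return {
        "assign": rng.choice(["SCRIBBLE", "BIND", "STORE", "PUT"]),
        "becomes": rng.choice(["BECOMES", ":=", "TO", "<-"]),
        "emit": rng.choice(["YELL", "SPEAK", "SHOUT", "EMIT"]),
        "sigil": rng.choice(["", "@", "$"]),
    }


def make_pair(lang: dict, rng: random.Random) -> tuple[str, str]:
    """Return a (code, output) pair for a tiny assign-and-print program."""
    env = {}
    lines = []
    for name in ("x", "y"):
        value = rng.randint(1, 20)
        env[name] = value  # semantics: assignment
        lines.append(f"{lang['assign']} {lang['sigil']}{name} {lang['becomes']} {value}")
    lines.append(f"{lang['emit']} {lang['sigil']}x + {lang['sigil']}y")
    output = str(env["x"] + env["y"])  # semantics: evaluate and print
    return "\n".join(lines), output


rng = random.Random(0)
code, output = make_pair(make_language(rng), rng)
print(f"Code:\n{code}\n\nOutput:\n{output}")
```

In the real training set this idea is scaled up to 200 generated languages across three complexity tiers, as described under Training below.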
+ +## Performance + +![Benchmark_by_Category](https://cdn-uploads.huggingface.co/production/uploads/670562d6ac129959c16f84d4/AZhDOGagSMRSNQmFu9bgC.png) + +![Real vs Novel Fake Languages](https://cdn-uploads.huggingface.co/production/uploads/670562d6ac129959c16f84d4/HghKHvXpx-Ddta8on-WqV.png) + +**Overall: 96.2% (50/52 tests)** + +| Category | Accuracy | Passed/Total | +|---|---|---| +| Real: Python | 100% | 10/10 | +| Real: JavaScript | 100% | 8/8 | +| Real: Lua | 100% | 6/6 | +| Real: COBOL | 75% | 3/4 | +| Novel Fake: Tier 1 (assign + print) | 100% | 8/8 | +| Novel Fake: Tier 2 (conditionals) | 86% | 6/7 | +| Novel Fake: Tier 3 (loops) | 100% | 4/4 | +| Edge Cases | 100% | 5/5 | + +The novel fake language tests use languages that were never seen during training — completely invented syntax like `SCRIBBLE @x BECOMES 7` or `WONDER n > 10`. The model infers semantics from context and gets them right. + +### Known Failures + +Two failures in the benchmark, both explainable: + +- **COBOL zero-padding** — predicted `08` instead of `0008`. Got the value right, missed the `PIC 9(4)` padding format. Data consistency issue. +- **If-without-else** — when a conditional has no else branch and the condition is false, the correct output is empty. The model predicted `NO`, hallucinating an else branch. Most training data had if/else pairs so it defaulted to that pattern. + +## How It Works + +Input format: +``` +Code: + + +Output: +``` + +The model completes the `Output:` section with the predicted stdout. + +### Example — Real Language + +``` +Code: +a = 10 +b = 20 +print(a + b) + +Output: +30 +``` + +### Example — Novel Fake Language (never seen during training) + +``` +Code: +SCRIBBLE @x BECOMES 7 +SCRIBBLE @y BECOMES 3 +YELL @x + @y + +Output: +10 +``` + +``` +Code: +BIND n TO 15 +WONDER n > 10 + SHOUT YES +STOP + +Output: +YES +``` + +## Quick Start + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer +import torch + +model = AutoModelForCausalLM.from_pretrained( + "CaaLM/CaaLM-v1", + torch_dtype=torch.bfloat16, + device_map="auto" +) +tokenizer = AutoTokenizer.from_pretrained("CaaLM/CaaLM-v1") +model.eval() + +def predict_output(code: str) -> str: + prompt = f"Code:\n{code}\n\nOutput:\n" + inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + + with torch.no_grad(): + outputs = model.generate( + **inputs, + max_new_tokens=128, + do_sample=False, + pad_token_id=tokenizer.eos_token_id, + ) + + return tokenizer.decode( + outputs[0][inputs.input_ids.shape[1]:], + skip_special_tokens=True + ).strip() + +# Real language +print(predict_output("a = 6\nb = 7\nprint(a * b)")) +# → 42 + +# Novel fake language +print(predict_output("STORE X := 10\nSTORE Y := 5\nSPEAK X + Y")) +# → 15 +``` + +## Training + +![Training Summary](https://cdn-uploads.huggingface.co/production/uploads/670562d6ac129959c16f84d4/UXPYmNvYDiIsfHR5JC55n.png) + +### Data + +Training data was split between real and synthetic languages: + +**Real languages (8,000 examples total, 2,000 each):** +- Python — clean semantics, baseline +- JavaScript — type coercion, implicit behaviors +- Lua — minimal syntax, sparse +- COBOL — verbose, English-like, no conventional syntax markers + +**Synthetic languages (120,000 examples total):** +- 200 procedurally generated fake languages +- Each language has randomized keywords, operators, variable styles, and block delimiters +- Semantics are consistent within each language but syntax varies wildly across all 200 +- Programs generated via a Python simulator — outputs are 
ground truth from actual execution +- Three complexity tiers: assign+print (30%), conditionals (40%), loops (30%) + +The spec for each fake language is discarded after data generation. The model only ever sees `(code, output)` pairs — it never gets a syntax guide. + +### Configuration + +- **Base model:** Qwen/Qwen2.5-1.5B (base, not instruct) +- **Training method:** Full fine-tuning (no LoRA) +- **Loss masking:** Loss computed on output tokens only, not prompt +- **Precision:** BF16 +- **Optimizer:** AdamW (lr=2e-5, weight_decay=0.01) +- **Scheduler:** Cosine with 3% warmup +- **Batch size:** 8 per device × 4 gradient accumulation = 32 effective +- **Epochs:** 3 +- **Max sequence length:** 512 tokens +- **Hardware:** NVIDIA A100 SXM4 40GB +- **Training time:** 66.5 minutes +- **Training cost:** ~$0.82 + +## Supported Operations + +The model reliably handles: + +- Variable assignment and arithmetic +- Print / output statements +- Conditionals (if/else) +- While loops with accumulator patterns +- String output +- Basic error behavior (empty output when conditions not met) + +It does not handle: functions, recursion, file I/O, complex data structures, pipes, or multi-line string manipulation. These may work in real languages due to Qwen's pretraining knowledge but are not guaranteed. + +## Limitations + +- No actual code execution — outputs are predictions, not guarantees +- If-without-else edge cases can produce hallucinated else branches +- COBOL numeric padding format is inconsistent +- Long programs (many steps) may degrade in accuracy as state complexity grows +- Novel fake languages with very unusual execution models (non-linear control flow, stack-based semantics) are untested +- Context window limits programs to ~512 tokens + +## Why + +The original motivation was to ask: can a language model learn what *execution* means as an abstract concept, independent of any specific language's syntax? + +The novel fake language results suggest yes, at least for basic programs. The model sees `WONDER x > 10` for the first time and figures out it's a conditional. It sees `SCRIBBLE @x BECOMES 7` and figures out it's assignment. It doesn't know these keywords — it infers them from the structure of the code and the patterns it learned during training. + +Whether this scales to more complex programs, more alien execution models, or larger languages is an open question. + +## Model Lineage + +CaaLM-v1 is the first model in the CaaLM series, and a spiritual successor to the [LaaLM project](https://huggingface.co/LaaLM). 
+ +- **LaaLM-v1** — T5-base fine-tuned to simulate Linux shell commands (external state) +- **LaaLM-exp-v1** — Qwen 3B fine-tuned for conversational Linux terminal emulation (internal state) +- **CaaLM-v1** — Qwen 1.5B fine-tuned for language-agnostic code output prediction (current) + +## License + +Apache 2.0 (inherited from Qwen 2.5 base model) \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..5bec861 --- /dev/null +++ b/config.json @@ -0,0 +1,64 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "bfloat16", + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "pad_token_id": 151665, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000.0, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.5.0", + "unsloth_fixed": true, + "unsloth_version": "2026.4.6", + "use_cache": false, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..2ef4145 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "bos_token_id": 151643, + "eos_token_id": 151643, + "max_length": 32768, + "max_new_tokens": 2048, + "pad_token_id": 151665, + "transformers_version": "5.5.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..66d4f39 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1f1591e4af5ee1650d6bc3a282c2a5d98cc69ce237108a98a55c78721bc752d +size 3087467144 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..5340d81 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd5948af71b4f56cf697f7580814c7ce8b80595ef985544efcacf716126a2e31 +size 11422356 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..1f1e6cf --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,15 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "left", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +}