同步 b7516

2026-01-23 11:34:20 +08:00
parent d3752412b3
commit b1cf23ae3e
2015 changed files with 935048 additions and 1 deletions
--- a/examples/model-conversion/scripts/causal/compare-embeddings-logits.sh
+++ b/examples/model-conversion/scripts/causal/compare-embeddings-logits.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+set -e
+
+MODEL_PATH="${1:-"$MODEL_PATH"}"
+MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"
+
+if [ -t 0 ]; then
+    CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
+else
+    # Process piped JSON data and convert to binary (matching logits.cpp format)
+    TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)
+    python3 -c "
+import json
+import sys
+import struct
+
+data = json.load(sys.stdin)
+
+# Flatten all embeddings completely
+flattened = []
+for item in data:
+    embedding = item['embedding']
+    for token_embedding in embedding:
+        flattened.extend(token_embedding)
+
+print(f'Total embedding values: {len(flattened)}', file=sys.stderr)
+
+# Write as binary floats - matches logits.cpp fwrite format
+with open('$TEMP_FILE', 'wb') as f:
+    for value in flattened:
+        f.write(struct.pack('f', value))
+"
+    CPP_EMBEDDINGS="$TEMP_FILE"
+    trap "rm -f $TEMP_FILE" EXIT
+fi
+# Quote every expansion so model paths containing spaces reach semantic_check.py intact
+python scripts/utils/semantic_check.py --model-path "$MODEL_PATH" \
+    --python-embeddings "data/pytorch-${MODEL_NAME}-embeddings.bin" \
+    --cpp-embeddings "$CPP_EMBEDDINGS" \
+    --prompt "Hello world today" \
+    --causal
+
--- a/examples/model-conversion/scripts/causal/compare-logits.py
+++ b/examples/model-conversion/scripts/causal/compare-logits.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+
+import sys
+import numpy as np
+from pathlib import Path
+
+# Add utils directory to path for direct script execution
+sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
+from common import get_model_name_from_env_path  # type: ignore[import-not-found]
+
+def quick_logits_check(pytorch_file, llamacpp_file):
+    """Print a brief comparison of two float32 logit dumps; False if loading or shapes fail"""
+
+    try:
+        reference, candidate = (
+            np.fromfile(path, dtype=np.float32) for path in (pytorch_file, llamacpp_file)
+        )
+    except Exception as load_error:
+        print(f"❌ NOK: Failed to load files - {load_error}")
+        return False
+
+    # Element-wise comparison below requires identical shapes
+    if reference.shape != candidate.shape:
+        print(f"❌ NOK: Shape mismatch - PyTorch: {reference.shape}, llama.cpp: {candidate.shape}")
+        return False
+
+    # Largest element-wise deviation between the two dumps
+    max_diff = np.max(np.abs(reference - candidate))
+
+    # Ten largest logits of a dump, highest first
+    def top10(values):
+        return values[np.argsort(values)[-10:][::-1]]
+
+    print(f"Top 10 PyTorch logits: {top10(reference)}")
+    print(f"Top 10 llama.cpp logits: {top10(candidate)}")
+    print(f"Max absolute difference: {max_diff:.4f}")
+
+    return True
+
+def main():
+    """Locate the PyTorch and llama.cpp logit dumps under data/ and run the quick check."""
+    model_name = get_model_name_from_env_path('MODEL_PATH')
+    data_dir = Path("data")
+    pytorch_file = data_dir / f"pytorch-{model_name}.bin"
+
+    llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL')
+    print(f"Using converted model: {llamacpp_model_name}")
+    llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin"
+
+    if not pytorch_file.exists():
+        print(f"Error: PyTorch logits file not found: {pytorch_file}")
+        print("Please run scripts/run-org-model.sh first to generate this file.")
+        sys.exit(1)
+
+    if not llamacpp_file.exists():
+        print(f"Error: llama.cpp logits file not found: {llamacpp_file}")
+        print("Please run scripts/run-converted-model.sh first to generate this file.")
+        sys.exit(1)
+
+    print("Checked all required files were found. Proceeding...\n")
+
+    print("🔍 GGML Model Validation for model ", model_name)
+    print("=" * 40)
+    print(f"PyTorch logits  : {pytorch_file}")
+    print(f"llama.cpp logits: {llamacpp_file}")
+    print()
+
+    success = quick_logits_check(pytorch_file, llamacpp_file)
+
+    # Exit code mirrors the check, which fails only on a load error or a shape mismatch
+    if success:
+        print("✅ OK: Lightweight model check successful!")
+        print("       Ok to proceed with NMSE check...")
+        sys.exit(0)
+    else:
+        print("❌ NOK: Lightweight model check failed - see the message above")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
--- a/examples/model-conversion/scripts/causal/convert-model.sh
+++ b/examples/model-conversion/scripts/causal/convert-model.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+
+set -e
+
+# Parse command line arguments
+MMPROJ=""
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --mmproj)
+            MMPROJ="--mmproj"
+            shift
+            ;;
+        *)
+            shift
+            ;;
+    esac
+done
+
+MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
+OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
+TYPE="${OUTTYPE:-f16}"
+METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
+CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
+
+echo "Model path: ${MODEL_PATH}"
+echo "Model name: ${MODEL_NAME}"
+echo "Data  type: ${TYPE}"
+echo "Converted model path:: ${CONVERTED_MODEL}"
+echo "Metadata override: ${METADATA_OVERRIDE}"
+
+CMD_ARGS=("python" "../../convert_hf_to_gguf.py" "--verbose")
+CMD_ARGS+=("${MODEL_PATH}")
+CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}")
+CMD_ARGS+=("--outtype" "${TYPE}")
+[[ -n "$METADATA_OVERRIDE" ]] && CMD_ARGS+=("--metadata" "${METADATA_OVERRIDE}")
+[[ -n "$MMPROJ" ]] && CMD_ARGS+=("${MMPROJ}")
+
+"${CMD_ARGS[@]}"
+# Print a ready-to-paste export; the path is quoted so realpath receives it as one argument
+echo ""
+echo "The environment variable CONVERTED_MODEL can be set to this path using:"
+echo "export CONVERTED_MODEL=$(realpath "${CONVERTED_MODEL}")"
+if [[ -n "$MMPROJ" ]]; then
+    mmproj_file="${OUTPUT_DIR}/mmproj-$(basename "${CONVERTED_MODEL}")"
+    echo "The mmproj model was created in $(realpath "$mmproj_file")"
+fi
--- a/examples/model-conversion/scripts/causal/modelcard.template
+++ b/examples/model-conversion/scripts/causal/modelcard.template
@@ -0,0 +1,13 @@
+---
+base_model:
+- {base_model}
+---
+# {model_name} GGUF
+
+Recommended way to run this model:
+
+```sh
+llama-server -hf {namespace}/{model_name}-GGUF -c 0
+```
+
+Then, access http://localhost:8080
--- a/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py
+++ b/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import importlib
+import torch
+import numpy as np
+
+from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
+from pathlib import Path
+
+unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
+
+parser = argparse.ArgumentParser(description='Process model with specified path')
+parser.add_argument('--model-path', '-m', help='Path to the model')
+args = parser.parse_args()
+
+model_path = os.environ.get('MODEL_PATH', args.model_path)
+if model_path is None:
+    parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
+
+config = AutoConfig.from_pretrained(model_path)
+
+print("Model type:       ", config.model_type)
+print("Vocab size:       ", config.vocab_size)
+print("Hidden size:      ", config.hidden_size)
+print("Number of layers: ", config.num_hidden_layers)
+print("BOS token id:     ", config.bos_token_id)
+print("EOS token id:     ", config.eos_token_id)
+
+print("Loading model and tokenizer using AutoTokenizer:", model_path)
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+if unreleased_model_name:
+    model_name_lower = unreleased_model_name.lower()
+    unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
+    class_name = f"{unreleased_model_name}ForCausalLM"
+    print(f"Importing unreleased model module: {unreleased_module_path}")
+
+    try:
+        model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
+        model = model_class.from_pretrained(model_path)
+    except (ImportError, AttributeError) as e:
+        print(f"Failed to import or load model: {e}")
+        print("Falling back to AutoModelForCausalLM")
+        model = AutoModelForCausalLM.from_pretrained(model_path)
+else:
+    model = AutoModelForCausalLM.from_pretrained(model_path)
+print(f"Model class: {type(model)}")
+#print(f"Model file: {type(model).__module__}")
+
+model_name = os.path.basename(model_path)
+print(f"Model name: {model_name}")
+
+prompt = "Hello world today"
+input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+print(f"Input tokens: {input_ids}")
+print(f"Input text: {repr(prompt)}")
+print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
+
+with torch.no_grad():
+    outputs = model(input_ids, output_hidden_states=True)
+
+    # Extract hidden states from the last layer
+    # outputs.hidden_states is a tuple of (num_layers + 1) tensors
+    # Index -1 gets the last layer, shape: [batch_size, seq_len, hidden_size]
+    last_hidden_states = outputs.hidden_states[-1]
+
+    # Get embeddings for all tokens
+    token_embeddings = last_hidden_states[0].cpu().numpy()  # Remove batch dimension
+
+    print(f"Hidden states shape: {last_hidden_states.shape}")
+    print(f"Token embeddings shape: {token_embeddings.shape}")
+    print(f"Hidden dimension: {token_embeddings.shape[-1]}")
+    print(f"Number of tokens: {token_embeddings.shape[0]}")
+
+    # Save raw token embeddings
+    data_dir = Path("data")
+    data_dir.mkdir(exist_ok=True)
+    bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
+    txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
+
+    # Save all token embeddings as binary
+    print(token_embeddings)
+    token_embeddings.astype(np.float32).tofile(bin_filename)
+
+    # Save as text for inspection
+    with open(txt_filename, "w") as f:
+        for i, embedding in enumerate(token_embeddings):
+            for j, val in enumerate(embedding):
+                f.write(f"{i} {j} {val:.6f}\n")
+
+    # Print embeddings per token in the requested format
+    print("\nToken embeddings:")
+    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
+    for i, embedding in enumerate(token_embeddings):
+        # Format: show first few values, ..., then last few values
+        if len(embedding) > 10:
+            # Show first 3 and last 3 values with ... in between
+            first_vals = " ".join(f"{val:8.6f}" for val in embedding[:3])
+            last_vals = " ".join(f"{val:8.6f}" for val in embedding[-3:])
+            print(f"embedding {i}: {first_vals}  ... {last_vals}")
+        else:
+            # If embedding is short, show all values
+            vals = " ".join(f"{val:8.6f}" for val in embedding)
+            print(f"embedding {i}: {vals}")
+
+    # Also show token info for reference
+    print(f"\nToken reference:")
+    for i, token in enumerate(tokens):
+        print(f"  Token {i}: {repr(token)}")
+    # These files hold hidden-state embeddings, not logits
+    print(f"Saved bin embeddings to: {bin_filename}")
+    print(f"Saved txt embeddings to: {txt_filename}")
--- a/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh
+++ b/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+set -e
+
+# The command line argument takes precedence over the CONVERTED_MODEL environment variable
+CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
+
+# Final check if we have a model path
+if [ -z "$CONVERTED_MODEL" ]; then
+    echo "Error: Model path must be provided either as:" >&2
+    echo "  1. Command line argument" >&2
+    echo "  2. CONVERTED_MODEL environment variable" >&2
+    exit 1
+fi
+
+cmake --build ../../build --target llama-logits -j8
+
+../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "Hello world today"
--- a/examples/model-conversion/scripts/causal/run-converted-model.sh
+++ b/examples/model-conversion/scripts/causal/run-converted-model.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+set -e
+
+# Command line arguments take precedence over the environment variables
+CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
+MODEL_TESTING_PROMPT="${2:-"$MODEL_TESTING_PROMPT"}"
+
+if [ -z "$MODEL_TESTING_PROMPT" ]; then
+    MODEL_TESTING_PROMPT="Hello, my name is"
+fi
+
+# Final check if we have a model path
+if [ -z "$CONVERTED_MODEL" ]; then
+    echo "Error: Model path must be provided either as:" >&2
+    echo "  1. Command line argument" >&2
+    echo "  2. CONVERTED_MODEL environment variable" >&2
+    exit 1
+fi
+
+echo "$CONVERTED_MODEL"
+echo "$MODEL_TESTING_PROMPT"
+
+cmake --build ../../build --target llama-logits -j8
+
+../../build/bin/llama-logits -m "$CONVERTED_MODEL" "$MODEL_TESTING_PROMPT"
--- a/examples/model-conversion/scripts/causal/run-org-model.py
+++ b/examples/model-conversion/scripts/causal/run-org-model.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import sys
+import importlib
+from pathlib import Path
+
+# Add parent directory to path for imports
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, AutoConfig
+import torch
+import numpy as np
+from utils.common import debug_hook
+
+parser = argparse.ArgumentParser(description="Process model with specified path")
+parser.add_argument("--model-path", "-m", help="Path to the model")
+parser.add_argument("--prompt-file", "-f", help="Optional prompt file", required=False)
+parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose debug output")
+args = parser.parse_args()
+
+model_path = os.environ.get("MODEL_PATH", args.model_path)
+if model_path is None:
+    parser.error(
+        "Model path must be specified either via --model-path argument or MODEL_PATH environment variable"
+    )
+
+### If you want to dump RoPE activations, uncomment the following lines:
+### === START ROPE DEBUG ===
+# from utils.common import setup_rope_debug
+# setup_rope_debug("transformers.models.apertus.modeling_apertus")
+### == END ROPE DEBUG ===
+
+
+print("Loading model and tokenizer using AutoTokenizer:", model_path)
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+multimodal = False
+full_config = config
+
+print("Model type:       ", config.model_type)
+if "vocab_size" not in config and "text_config" in config:
+    config = config.text_config
+    multimodal = True
+print("Vocab size:       ", config.vocab_size)
+print("Hidden size:      ", config.hidden_size)
+print("Number of layers: ", config.num_hidden_layers)
+print("BOS token id:     ", config.bos_token_id)
+print("EOS token id:     ", config.eos_token_id)
+
+unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
+if unreleased_model_name:
+    model_name_lower = unreleased_model_name.lower()
+    unreleased_module_path = (
+        f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
+    )
+    class_name = f"{unreleased_model_name}ForCausalLM"
+    print(f"Importing unreleased model module: {unreleased_module_path}")
+
+    try:
+        model_class = getattr(
+            importlib.import_module(unreleased_module_path), class_name
+        )
+        model = model_class.from_pretrained(
+            model_path
+        )  # Note: from_pretrained, not fromPretrained
+    except (ImportError, AttributeError) as e:
+        print(f"Failed to import or load model: {e}")
+        exit(1)
+else:
+    if multimodal:
+        model = AutoModelForImageTextToText.from_pretrained(
+            model_path, device_map="auto", offload_folder="offload", trust_remote_code=True, config=full_config
+        )
+    else:
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path, device_map="auto", offload_folder="offload", trust_remote_code=True, config=config
+        )
+
+if args.verbose:
+    for name, module in model.named_modules():
+        if len(list(module.children())) == 0:  # only leaf modules
+            module.register_forward_hook(debug_hook(name))
+
+model_name = os.path.basename(model_path)
+# Printing the Model class to allow for easier debugging. This can be useful
+# when working with models that have not been publicly released yet and this
+# might require that the concrete class is imported and used directly instead
+# of using AutoModelForCausalLM.
+print(f"Model class: {model.__class__.__name__}")
+
+device = next(model.parameters()).device
+if args.prompt_file:
+    with open(args.prompt_file, encoding='utf-8') as f:
+        prompt = f.read()
+elif os.getenv("MODEL_TESTING_PROMPT"):
+    prompt = os.getenv("MODEL_TESTING_PROMPT")
+else:
+    prompt = "Hello, my name is"
+input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
+
+print(f"Input tokens: {input_ids}")
+print(f"Input text: {repr(prompt)}")
+print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
+
+batch_size = 512
+
+with torch.no_grad():
+    past = None
+    outputs = None
+    for i in range(0, input_ids.size(1), batch_size):
+        chunk = input_ids[:, i:i + batch_size]  # slicing clamps at the sequence end
+        print(f"Processing chunk with tokens {i} to {i + chunk.size(1)}")
+        outputs = model(chunk.to(model.device), past_key_values=past, use_cache=True)
+        past = outputs.past_key_values
+
+    logits = outputs.logits # type: ignore
+
+    # Extract logits for the last token (next token prediction)
+    last_logits = logits[0, -1, :].float().cpu().numpy()
+
+    print(f"Logits shape: {logits.shape}")
+    print(f"Last token logits shape: {last_logits.shape}")
+    print(f"Vocab size: {len(last_logits)}")
+
+    data_dir = Path("data")
+    data_dir.mkdir(exist_ok=True)
+    bin_filename = data_dir / f"pytorch-{model_name}.bin"
+    txt_filename = data_dir / f"pytorch-{model_name}.txt"
+
+    # Save to file for comparison
+    last_logits.astype(np.float32).tofile(bin_filename)
+
+    # Also save as text file for easy inspection
+    with open(txt_filename, "w") as f:
+        for i, logit in enumerate(last_logits):
+            f.write(f"{i}: {logit:.6f}\n")
+
+    # Print some sample logits for quick verification
+    print(f"First 10 logits: {last_logits[:10]}")
+    print(f"Last 10 logits: {last_logits[-10:]}")
+
+    # Show top 5 predicted tokens
+    top_indices = np.argsort(last_logits)[-5:][::-1]
+    print("Top 5 predictions:")
+    for idx in top_indices:
+        token = tokenizer.decode([idx])
+        print(f"  Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")
+    # Report where the logits were written
+    print(f"Saved bin logits to: {bin_filename}")
+    print(f"Saved txt logits to: {txt_filename}")
--- a/examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh
+++ b/examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+
+set -e
+
+# Parse command line arguments
+MODEL_PATH=""
+MODEL_NAME=""
+PROMPTS_FILE=""
+
+# First argument is always model path
+if [ $# -gt 0 ] && [[ "$1" != --* ]]; then
+    MODEL_PATH="$1"
+    shift
+fi
+
+# Parse remaining arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --prompts-file|-pf)
+            PROMPTS_FILE="$2"
+            shift 2
+            ;;
+        *)
+            # If MODEL_NAME not set and this isn't a flag, use as model name
+            if [ -z "$MODEL_NAME" ] && [[ "$1" != --* ]]; then
+                MODEL_NAME="$1"
+            fi
+            shift
+            ;;
+    esac
+done
+
+# Set defaults
+MODEL_PATH="${MODEL_PATH:-"$EMBEDDING_MODEL_PATH"}"
+MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
+
+CONVERTED_MODEL_PATH="${CONVERTED_EMBEDDING_PATH:-"$CONVERTED_EMBEDDING_MODEL"}"
+CONVERTED_MODEL_NAME="${CONVERTED_MODEL_NAME:-$(basename "$CONVERTED_MODEL_PATH" .gguf)}"
+
+if [ -t 0 ]; then
+    CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin"
+else
+    # Process piped JSON data and convert to binary (matching logits.cpp format)
+    TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)
+    python3 -c "
+import json
+import sys
+import struct
+
+data = json.load(sys.stdin)
+
+# Flatten all embeddings completely
+flattened = []
+for item in data:
+    embedding = item['embedding']
+    for token_embedding in embedding:
+        flattened.extend(token_embedding)
+
+print(f'Total embedding values: {len(flattened)}', file=sys.stderr)
+
+# Write as binary floats - matches logits.cpp fwrite format
+with open('$TEMP_FILE', 'wb') as f:
+    for value in flattened:
+        f.write(struct.pack('f', value))
+"
+    CPP_EMBEDDINGS="$TEMP_FILE"
+    trap "rm -f $TEMP_FILE" EXIT
+fi
+
+# Build the semantic_check.py command as an array so that paths containing
+# spaces or shell metacharacters are passed through unmodified (no eval needed)
+SEMANTIC_CMD=(python scripts/utils/semantic_check.py --model-path "$MODEL_PATH"
+    --python-embeddings "data/pytorch-${MODEL_NAME}-embeddings.bin"
+    --cpp-embeddings "$CPP_EMBEDDINGS")
+
+# Add prompts file if specified, otherwise use default prompt
+if [ -n "$PROMPTS_FILE" ]; then
+    SEMANTIC_CMD+=(--prompts-file "$PROMPTS_FILE")
+else
+    SEMANTIC_CMD+=(--prompt "Hello world today")
+fi
+
+"${SEMANTIC_CMD[@]}"
+
--- a/examples/model-conversion/scripts/embedding/convert-model.sh
+++ b/examples/model-conversion/scripts/embedding/convert-model.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+set -e
+
+# Parse command line arguments
+SENTENCE_TRANSFORMERS=""
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -st|--sentence-transformers)
+            SENTENCE_TRANSFORMERS="--sentence-transformers-dense-modules"
+            shift
+            ;;
+        *)
+            echo "Unknown option: $1"
+            exit 1
+            ;;
+    esac
+done
+
+MODEL_NAME="${MODEL_NAME:-$(basename "$EMBEDDING_MODEL_PATH")}"
+OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
+TYPE="${OUTTYPE:-f16}"
+METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
+CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
+
+echo "Model path: ${EMBEDDING_MODEL_PATH}"
+echo "Model name: ${MODEL_NAME}"
+echo "Data  type: ${TYPE}"
+echo "Converted model path:: ${CONVERTED_MODEL}"
+python ../../convert_hf_to_gguf.py --verbose \
+    "${EMBEDDING_MODEL_PATH}" \
+    --outfile "${CONVERTED_MODEL}" \
+    --outtype "${TYPE}" \
+    ${SENTENCE_TRANSFORMERS}
+# SENTENCE_TRANSFORMERS stays unquoted on purpose: when empty it must expand to no argument
+echo ""
+echo "The environment variable CONVERTED_EMBEDDING_MODEL can be set to this path using:"
+echo "export CONVERTED_EMBEDDING_MODEL=$(realpath "${CONVERTED_MODEL}")"
--- a/examples/model-conversion/scripts/embedding/modelcard.template
+++ b/examples/model-conversion/scripts/embedding/modelcard.template
@@ -0,0 +1,48 @@
+---
+base_model:
+- {base_model}
+---
+# {model_name} GGUF
+
+Recommended way to run this model:
+
+```sh
+llama-server -hf {namespace}/{model_name}-GGUF --embeddings
+```
+
+Then the endpoint can be accessed at http://localhost:8080/embedding, for
+example using `curl`:
+```console
+curl --request POST \
+    --url http://localhost:8080/embedding \
+    --header "Content-Type: application/json" \
+    --data '{{"input": "Hello embeddings"}}' \
+    --silent
+```
+
+Alternatively, the `llama-embedding` command line tool can be used:
+```sh
+llama-embedding -hf {namespace}/{model_name}-GGUF --verbose-prompt -p "Hello embeddings"
+```
+
+#### embd_normalize
+When a model uses pooling, or the pooling method is specified using `--pooling`,
+the normalization can be controlled by the `embd_normalize` parameter.
+
+The default value is `2` which means that the embeddings are normalized using
+the Euclidean norm (L2). Other options are:
+* -1 No normalization
+*  0 Max absolute
+*  1 Taxicab
+*  2 Euclidean/L2
+* \>2 P-Norm
+
+This can be passed in the request body to `llama-server`, for example:
+```sh
+    --data '{{"input": "Hello embeddings", "embd_normalize": -1}}' \
+```
+
+And for `llama-embedding`, by passing `--embd-normalize <value>`, for example:
+```sh
+llama-embedding -hf {namespace}/{model_name}-GGUF  --embd-normalize -1 -p "Hello embeddings"
+```
--- a/examples/model-conversion/scripts/embedding/run-converted-model.sh
+++ b/examples/model-conversion/scripts/embedding/run-converted-model.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+
+set -e
+
+# Parse command line arguments
+CONVERTED_MODEL=""
+PROMPTS_FILE=""
+USE_POOLING=""
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -p|--prompts-file)
+            PROMPTS_FILE="$2"
+            shift 2
+            ;;
+        --pooling)
+            USE_POOLING="1"
+            shift
+            ;;
+        *)
+            if [ -z "$CONVERTED_MODEL" ]; then
+                CONVERTED_MODEL="$1"
+            fi
+            shift
+            ;;
+    esac
+done
+
+# First try command line argument, then environment variable
+CONVERTED_MODEL="${CONVERTED_MODEL:-"$CONVERTED_EMBEDDING_MODEL"}"
+
+# Final check if we have a model path
+if [ -z "$CONVERTED_MODEL" ]; then
+    echo "Error: Model path must be provided either as:" >&2
+    echo "  1. Command line argument" >&2
+    echo "  2. CONVERTED_EMBEDDING_MODEL environment variable" >&2
+    exit 1
+fi
+
+# Read prompt from file or use default
+if [ -n "$PROMPTS_FILE" ]; then
+    if [ ! -f "$PROMPTS_FILE" ]; then
+        echo "Error: Prompts file '$PROMPTS_FILE' not found" >&2
+        exit 1
+    fi
+    PROMPT=$(cat "$PROMPTS_FILE")
+else
+    PROMPT="Hello world today"
+fi
+
+echo $CONVERTED_MODEL
+
+cmake --build ../../build --target llama-logits -j8
+# TODO: update logits.cpp to accept a --file/-f option for the prompt
+if [ -n "$USE_POOLING" ]; then
+    ../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode -pooling "$PROMPT"
+else
+    ../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "$PROMPT"
+fi
--- a/examples/model-conversion/scripts/embedding/run-original-model.py
+++ b/examples/model-conversion/scripts/embedding/run-original-model.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import numpy as np
+import importlib
+from pathlib import Path
+
+from transformers import AutoTokenizer, AutoConfig, AutoModel
+import torch
+
+unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
+
+parser = argparse.ArgumentParser(description='Process model with specified path')
+parser.add_argument('--model-path', '-m', help='Path to the model')
+parser.add_argument('--prompts-file', '-p', help='Path to file containing prompts (one per line)')
+parser.add_argument('--use-sentence-transformers', action='store_true',
+                    help='Use SentenceTransformer to apply all numbered layers (01_Pooling, 02_Dense, 03_Dense, 04_Normalize)')
+args = parser.parse_args()
+
+def read_prompt_from_file(file_path):
+    # Whole file as one UTF-8 string, trimmed; any failure is reported and exits with 1
+    try:
+        return Path(file_path).read_text(encoding='utf-8').strip()
+    except FileNotFoundError:
+        print(f"Error: Prompts file '{file_path}' not found")
+        exit(1)
+    except Exception as read_error:
+        print(f"Error reading prompts file: {read_error}")
+        exit(1)
+
+model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
+if model_path is None:
+    parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable")
+
+# Determine if we should use SentenceTransformer
+use_sentence_transformers = args.use_sentence_transformers or os.environ.get('USE_SENTENCE_TRANSFORMERS', '').lower() in ('1', 'true', 'yes')
+
+if use_sentence_transformers:
+    from sentence_transformers import SentenceTransformer
+    print("Using SentenceTransformer to apply all numbered layers")
+    model = SentenceTransformer(model_path)
+    tokenizer = model.tokenizer
+    config = model[0].auto_model.config  # type: ignore
+else:
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+    config = AutoConfig.from_pretrained(model_path)
+
+    # This can be used to override the sliding window size for manual testing. This
+    # can be useful to verify the sliding window attention mask in the original model
+    # and compare it with the converted .gguf model.
+    if hasattr(config, 'sliding_window'):
+        original_sliding_window = config.sliding_window
+        #original_sliding_window = 6
+        print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")
+
+    print(f"Using unreleased model: {unreleased_model_name}")
+    if unreleased_model_name:
+        model_name_lower = unreleased_model_name.lower()
+        unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
+        class_name = f"{unreleased_model_name}Model"
+        print(f"Importing unreleased model module: {unreleased_module_path}")
+
+        try:
+            model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
+            model = model_class.from_pretrained(model_path, config=config)
+        except (ImportError, AttributeError) as e:
+            print(f"Failed to import or load model: {e}")
+            exit(1)
+    else:
+        model = AutoModel.from_pretrained(model_path, config=config)
+    print(f"Model class: {type(model)}")
+    print(f"Model file: {type(model).__module__}")
+
+# Verify the model is using the correct sliding window
+if not use_sentence_transformers:
+    if hasattr(model.config, 'sliding_window'):  # type: ignore
+        print(f"Model's sliding_window: {model.config.sliding_window}")  # type: ignore
+    else:
+        print("Model config does not have sliding_window attribute")
+
+model_name = os.path.basename(model_path)
+
+if args.prompts_file:
+    prompt_text = read_prompt_from_file(args.prompts_file)
+    texts = [prompt_text]
+else:
+    texts = ["Hello world today"]
+
+with torch.no_grad():
+    if use_sentence_transformers:
+        embeddings = model.encode(texts, convert_to_numpy=True)
+        all_embeddings = embeddings  # Shape: [batch_size, hidden_size]
+
+        encoded = tokenizer(
+            texts,
+            padding=True,
+            truncation=True,
+            return_tensors="pt"
+        )
+        tokens = encoded['input_ids'][0]
+        token_strings = tokenizer.convert_ids_to_tokens(tokens)
+        for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
+            print(f"{token_id:6d} -> '{token_str}'")
+
+        print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
+        print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}")  # type: ignore
+    else:
+        # Standard approach: use base model output only
+        encoded = tokenizer(
+            texts,
+            padding=True,
+            truncation=True,
+            return_tensors="pt"
+        )
+
+        tokens = encoded['input_ids'][0]
+        token_strings = tokenizer.convert_ids_to_tokens(tokens)
+        for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
+            print(f"{token_id:6d} -> '{token_str}'")
+
+        outputs = model(**encoded)
+        hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_size]
+
+        all_embeddings = hidden_states[0].cpu().numpy()  # Shape: [seq_len, hidden_size]
+
+        print(f"Hidden states shape: {hidden_states.shape}")
+        print(f"All embeddings shape: {all_embeddings.shape}")
+        print(f"Embedding dimension: {all_embeddings.shape[1]}")
+
+    if len(all_embeddings.shape) == 1:
+        n_embd = all_embeddings.shape[0]  # type: ignore
+        n_embd_count = 1
+        all_embeddings = all_embeddings.reshape(1, -1)
+    else:
+        n_embd = all_embeddings.shape[1]  # type: ignore
+        n_embd_count = all_embeddings.shape[0]  # type: ignore
+
+    print()
+
+    for j in range(n_embd_count):
+        embedding = all_embeddings[j]
+        print(f"embedding {j}: ", end="")
+
+        # Print first 3 values
+        for i in range(min(3, n_embd)):
+            print(f"{embedding[i]:9.6f} ", end="")
+
+        print(" ... ", end="")
+
+        # Print last 3 values
+        for i in range(n_embd - 3, n_embd):
+            print(f"{embedding[i]:9.6f} ", end="")
+
+        print()  # New line
+
+    print()
+
+    data_dir = Path("data")
+    data_dir.mkdir(exist_ok=True)
+    bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
+    txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
+
+    flattened_embeddings = all_embeddings.flatten()
+    flattened_embeddings.astype(np.float32).tofile(bin_filename)
+
+    with open(txt_filename, "w") as f:
+        idx = 0
+        for j in range(n_embd_count):
+            for value in all_embeddings[j]:
+                f.write(f"{idx}: {value:.6f}\n")
+                idx += 1
+    print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} embeddings × {n_embd} dimensions)")
+    print("")
+    print(f"Saved bin embeddings to: {bin_filename}")
+    print(f"Saved txt embeddings to: {txt_filename}")
--- a/examples/model-conversion/scripts/utils/init.py
+++ b/examples/model-conversion/scripts/utils/init.py
--- a/examples/model-conversion/scripts/utils/check-nmse.py
+++ b/examples/model-conversion/scripts/utils/check-nmse.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+
+import numpy as np
+import sys
+import os
+import argparse
+from pathlib import Path
+from common import get_model_name_from_env_path  # type: ignore[import-not-found]
+
+def calculate_nmse(reference, test):
+    mse = np.mean((test - reference) ** 2)
+    ref_var = np.var(reference)
+    if ref_var == 0:
+        nmse = float('inf') if mse > 0 else 0.0
+        return mse, mse, ref_var
+
+    nmse = mse / ref_var
+
+    return nmse, mse, ref_var
+
+def load_logits(file_path):
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"File not found: {file_path}")
+
+    if file_path.suffix == '.npy':
+        return np.load(file_path)
+    elif file_path.suffix == '.bin':
+        return np.fromfile(file_path, dtype=np.float32)
+    else:
+        # Try to load as text file
+        try:
+            # If it has index format "0: value", extract just values
+            data = []
+            with open(file_path, 'r') as f:
+                for line in f:
+                    if ':' in line:
+                        # Format: "index: value"
+                        value = float(line.split(':')[1].strip())
+                    else:
+                        # Just the value
+                        value = float(line.strip())
+                    data.append(value)
+            return np.array(data, dtype=np.float32)
+        except:
+            return np.loadtxt(file_path, dtype=np.float32)
+
+def interpret_nmse(nmse):
+    """Provide interpretation of NMSE value"""
+    if nmse == 0:
+        return "Perfect match", "🎉"
+    elif nmse < 1e-6:
+        return "Essentially identical", "✅"
+    elif nmse < 1e-4:
+        return "Excellent match", "✅"
+    elif nmse < 1e-3:
+        return "Very good match", "👍"
+    elif nmse < 1e-2:
+        return "Good match", "👍"
+    elif nmse < 0.1:
+        return "Acceptable match", "⚠️"
+    elif nmse < 1.0:
+        return "Poor match", "❌"
+    else:
+        return "Very poor match (worse than noise)", "❌"
+
+def main():
+    """Compare PyTorch and llama.cpp logits files and report their NMSE.
+
+    The file names are derived from the MODEL_PATH and CONVERTED_MODEL
+    environment variables and looked up in the local ``data`` directory.
+    Exits with status 0 when NMSE < 1e-2 and with status 1 otherwise
+    (including on a shape mismatch or any error while loading/comparing).
+    """
+    parser = argparse.ArgumentParser(description='Validate model logits')
+    parser.add_argument('-m', '--model-path', required=True,  help='Path to the model directory')
+    # NOTE(review): the parsed --model-path value is not read below; the model
+    # name comes from the MODEL_PATH environment variable instead.
+    args = parser.parse_args()
+
+    model_name = get_model_name_from_env_path('MODEL_PATH')
+    data_dir = Path("data")
+
+    pytorch_file = data_dir / f"pytorch-{model_name}.bin"
+
+    llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL')
+    llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin"
+
+    print(f"Model name: {model_name}")
+    print(f"PyTorch logits file: {pytorch_file}")
+    print(f"llama.cpp logits file: {llamacpp_file}")
+
+    # The PyTorch output is treated as ground truth, llama.cpp as the candidate.
+    reference_file = pytorch_file
+    test_file = llamacpp_file
+
+    print("📊 NMSE Check for Model Comparison")
+    print("=" * 50)
+    print(f"Reference (ground truth): {reference_file}")
+    print(f"Test (to evaluate):       {test_file}")
+    print()
+
+    try:
+        print("Loading reference logits...")
+        reference = load_logits(reference_file)
+        print(f"  Shape: {reference.shape}, Type: {reference.dtype}")
+
+        print("Loading test logits...")
+        test = load_logits(test_file)
+        print(f"  Shape: {test.shape}, Type: {test.dtype}")
+
+        # Check shapes match
+        if reference.shape != test.shape:
+            print(f"\n❌ Error: Shape mismatch!")
+            print(f"  Reference: {reference.shape}")
+            print(f"  Test: {test.shape}")
+            sys.exit(1)
+
+        print(f"\n✅ Shapes match: {reference.shape}")
+
+        nmse, mse, ref_var = calculate_nmse(reference, test)
+
+        # Additional metrics
+        max_abs_error = np.max(np.abs(test - reference))
+        mean_abs_error = np.mean(np.abs(test - reference))
+
+        # Results
+        print(f"\n📈 METRICS")
+        print("=" * 30)
+        print(f"MSE (Mean Squared Error):     {mse:.6e}")
+        print(f"Reference Variance:           {ref_var:.6e}")
+        print(f"NMSE:                         {nmse:.6e}")
+        print(f"Max Absolute Error:           {max_abs_error:.6f}")
+        print(f"Mean Absolute Error:          {mean_abs_error:.6f}")
+
+        # NMSE in dB (common in signal processing); skipped for nmse == 0
+        # because log10(0) is undefined.
+        if nmse > 0:
+            nmse_db = 10 * np.log10(nmse)
+            print(f"NMSE (dB):                    {nmse_db:.2f} dB")
+
+        # Interpretation
+        interpretation, emoji = interpret_nmse(nmse)
+        print(f"\n🎯 INTERPRETATION")
+        print("=" * 30)
+        print(f"{emoji} {interpretation}")
+
+        # Detailed guidance
+        print(f"\n📋 GUIDANCE")
+        print("=" * 30)
+        if nmse < 1e-3:
+            print("✅ EXCELLENT: Your GGML conversion is working very well!")
+            print("   The differences are negligible for practical use.")
+        elif nmse < 1e-2:
+            print("👍 GOOD: Your GGML conversion is working well.")
+            print("   Small differences are likely due to precision/quantization.")
+        elif nmse < 0.1:
+            print("⚠️  ACCEPTABLE: Conversion is working but with some differences.")
+            print("   Check if you're using quantization (Q4, Q8, etc.)")
+            print("   Test generation quality to see if it's acceptable.")
+        else:
+            print("❌ PROBLEMATIC: Large differences detected.")
+            print("   Check your conversion process for potential issues.")
+            print("   Verify you're using the same model weights.")
+
+        # NMSE benchmarks
+        print(f"\n📚 NMSE BENCHMARKS")
+        print("=" * 30)
+        print("< 1e-6:  Essentially identical")
+        print("< 1e-4:  Excellent (typical for good conversions)")
+        print("< 1e-3:  Very good")
+        print("< 1e-2:  Good (acceptable for most use cases)")
+        print("< 0.1:   Acceptable (may need verification)")
+        print("> 1.0:   Poor (worse than random)")
+
+        # Exit code based on NMSE
+        if nmse < 1e-2:
+            print(f"\n✅ RESULT: PASS (NMSE = {nmse:.2e})")
+            sys.exit(0)
+        else:
+            print(f"\n❌ RESULT: NEEDS REVIEW (NMSE = {nmse:.2e})")
+            sys.exit(1)
+
+    # sys.exit() raises SystemExit, which is not an Exception subclass, so the
+    # exit calls above pass through this handler untouched.
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        sys.exit(1)
+
+# Run the comparison only when executed as a script.
+if __name__ == "__main__":
+    main()
--- a/examples/model-conversion/scripts/utils/common.py
+++ b/examples/model-conversion/scripts/utils/common.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import torch
+
+
+def get_model_name_from_env_path(env_path_name):
+    model_path = os.getenv(env_path_name)
+    if not model_path:
+        print(f"Error: {env_path_name} environment variable not set")
+        sys.exit(1)
+
+    if not os.path.exists(model_path):
+        print(f"Error: Model file not found: {model_path}")
+        sys.exit(1)
+
+    name = os.path.basename(os.path.normpath(model_path))
+    if name.endswith(".gguf"):
+        name = name[:-5]
+
+    return name
+
+
+def summarize(tensor: torch.Tensor, name: str, max_seq: int = 3, max_vals: int = 3):
+    """
+    Print a tensor in llama.cpp debug style.
+
+    Supports:
+    - 2D tensors (seq, hidden)
+    - 3D tensors (batch, seq, hidden)
+    - 4D tensors (batch, seq, heads, dim_per_head) via flattening heads × dim_per_head
+
+    Shows first and last max_vals of each vector per sequence position.
+    """
+    t = tensor.detach().to(torch.float32).cpu()
+
+    # Determine dimensions
+    if t.ndim == 3:
+        _, s, _ = t.shape
+    elif t.ndim == 2:
+        _, s = 1, t.shape[0]
+        t = t.unsqueeze(0)
+    elif t.ndim == 4:
+        _, s, _, _ = t.shape
+    else:
+        print(f"Skipping tensor due to unsupported dimensions: {t.ndim}")
+        return
+
+    ten_shape = t.shape
+
+    print(f"ggml_debug: {name} = (f32)  ... = {{{ten_shape}}}")
+    print("                                     [")
+    print("                                      [")
+
+    # Determine indices for first and last sequences
+    first_indices = list(range(min(s, max_seq)))
+    last_indices = list(range(max(0, s - max_seq), s))
+
+    # Check if there's an overlap between first and last indices or if we're at the edge case of s = 2 * max_seq
+    has_overlap = bool(set(first_indices) & set(last_indices)) or (max_seq * 2 == s)
+
+    # Combine indices
+    if has_overlap:
+        # If there's overlap, just use the combined unique indices
+        indices = sorted(list(set(first_indices + last_indices)))
+        separator_index = None
+    else:
+        # If no overlap, we'll add a separator between first and last sequences
+        indices = first_indices + last_indices
+        separator_index = len(first_indices)
+
+    for i, si in enumerate(indices):
+        # Add separator if needed
+        if separator_index is not None and i == separator_index:
+            print("                                       ...")
+
+        # Extract appropriate slice
+        vec = t[0, si]
+        if vec.ndim == 2:  # 4D case: flatten heads × dim_per_head
+            flat = vec.flatten().tolist()
+        else:  # 2D or 3D case
+            flat = vec.tolist()
+
+        # First and last slices
+        first = flat[:max_vals]
+        last = flat[-max_vals:] if len(flat) >= max_vals else flat
+        first_str = ", ".join(f"{v:12.4f}" for v in first)
+        last_str = ", ".join(f"{v:12.4f}" for v in last)
+
+        print(f"                                       [{first_str}, ..., {last_str}]")
+
+    print("                                      ],")
+    print("                                     ]")
+    print(f"                                     sum = {t.sum().item():.6f}\n")
+
+
+def debug_hook(name):
+    def fn(_m, input, output):
+        if isinstance(input, torch.Tensor):
+            summarize(input, name + "_in")
+        elif isinstance(input, (tuple, list)) and len(input) > 0 and isinstance(input[0], torch.Tensor):
+            summarize(input[0], name + "_in")
+        if isinstance(output, torch.Tensor):
+            summarize(output, name + "_out")
+        elif isinstance(output, (tuple, list)) and len(output) > 0 and isinstance(output[0], torch.Tensor):
+            summarize(output[0], name + "_out")
+
+    return fn
+
+
+def setup_rope_debug(model_module_path: str, function_name: str = "apply_rotary_pos_emb"):
+    """
+    Apply monkey patch to dump RoPE activations for debugging.
+
+    Args:
+        model_module_path: Path to the model module (e.g., "transformers.models.apertus.modeling_apertus")
+        function_name: Name of the RoPE function to patch (default: "apply_rotary_pos_emb")
+
+    Example:
+        from utils.common import setup_rope_debug
+        setup_rope_debug("transformers.models.apertus.modeling_apertus")
+    """
+    import importlib
+
+    # Import the module and get the original function
+    module = importlib.import_module(model_module_path)
+    orig_rope = getattr(module, function_name)
+
+    # Set torch print options for better debugging
+    torch.set_printoptions(threshold=float('inf'))
+    torch.set_printoptions(precision=6, sci_mode=False)
+
+    def debug_rope(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+        # log inputs
+        summarize(q, "RoPE.q_in")
+        summarize(k, "RoPE.k_in")
+
+        # call original
+        q_out, k_out = orig_rope(q, k, cos, sin, position_ids, unsqueeze_dim)
+
+        # log outputs
+        summarize(q_out, "RoPE.q_out")
+        summarize(k_out, "RoPE.k_out")
+
+        return q_out, k_out
+
+    # Patch it
+    setattr(module, function_name, debug_rope)
+    print(f"RoPE debug patching applied to {model_module_path}.{function_name}")
--- a/examples/model-conversion/scripts/utils/create-collection-add-model.sh
+++ b/examples/model-conversion/scripts/utils/create-collection-add-model.sh
@@ -0,0 +1,8 @@
+
+#!/usr/bin/env bash
+
+COLLECTION_SLUG=$(python ./create_collection.py --return-slug)
+echo "Created collection: $COLLECTION_SLUG"
+
+# Use it in the next command
+python add_model_to_collection.py "$COLLECTION_SLUG" "username/my-model"
--- a/examples/model-conversion/scripts/utils/curl-embedding-server.sh
+++ b/examples/model-conversion/scripts/utils/curl-embedding-server.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+curl --request POST \
+    --url http://localhost:8080/embedding \
+    --header "Content-Type: application/json" \
+    --data '{"input": "Hello world today"}' \
+    --silent
--- a/examples/model-conversion/scripts/utils/hf-add-model-to-collection.py
+++ b/examples/model-conversion/scripts/utils/hf-add-model-to-collection.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+
+from huggingface_hub import HfApi
+import argparse
+import sys
+
+def add_model_to_collection(collection_slug, model_id, note=""):
+    """
+    Add a model to an existing collection
+
+    Args:
+        collection_slug: The slug of the collection (e.g., "username/collection-name-12345")
+        model_id: The model repository ID (e.g., "username/model-name")
+        note: Optional note about the model
+
+    Returns:
+        True if successful, False if failed
+    """
+
+    # Initialize API
+    api = HfApi()
+
+    try:
+        user_info = api.whoami()
+        print(f"✅ Authenticated as: {user_info['name']}")
+
+        # Verify the model exists
+        print(f"🔍 Checking if model exists: {model_id}")
+        try:
+            model_info = api.model_info(model_id)
+        except Exception as e:
+            print(f"❌ Model not found or not accessible: {model_id}")
+            print(f"Error: {e}")
+            return False
+
+        print(f"📚 Adding model to collection...")
+        api.add_collection_item(
+            collection_slug=collection_slug,
+            item_id=model_id,
+            item_type="model",
+            note=note
+        )
+
+        print(f"✅ Model added to collection successfully!")
+        print(f"🔗 Collection URL: https://huggingface.co/collections/{collection_slug}")
+
+        return True
+
+    except Exception as e:
+        print(f"❌ Error adding model to collection: {e}")
+        return False
+
+def main():
+    # This script requires that the environment variable HF_TOKEN is set with your
+    # Hugging Face API token.
+    api = HfApi()
+
+    parser = argparse.ArgumentParser(description='Add model to a Huggingface Collection')
+    parser.add_argument('--collection', '-c', help='The collection slug username/collection-hash', required=True)
+    parser.add_argument('--model', '-m', help='The model to add to the Collection', required=True)
+    parser.add_argument('--note', '-n', help='An optional note/description', required=False)
+    args = parser.parse_args()
+
+    collection = args.collection
+    model = args.model
+    note = args.note
+
+    success = add_model_to_collection(
+        collection_slug=collection,
+        model_id=model,
+        note=note
+    )
+
+    if success:
+        print("\n🎉 Model added successfully!")
+    else:
+        print("\n❌ Failed to add model to collection")
+        sys.exit(1)
+if __name__ == "__main__":
+    main()
--- a/examples/model-conversion/scripts/utils/hf-create-collection.py
+++ b/examples/model-conversion/scripts/utils/hf-create-collection.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+
+from huggingface_hub import HfApi
+import argparse
+import os
+import sys
+
+
+def create_collection(title, description, private=False, namespace=None, return_slug=False):
+    """
+    Create a new collection on Hugging Face
+
+    Args:
+        title: Collection title
+        description: Collection description
+        private: Whether the collection should be private (default: False)
+        namespace: Optional namespace (defaults to your username)
+
+    Returns:
+        Collection object if successful, None if failed
+    """
+
+    # Check if HF_TOKEN is available
+    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+    if not token:
+        print("❌ No HF_TOKEN or HUGGINGFACE_HUB_TOKEN found in environment variables")
+        print("Please set your Hugging Face token as an environment variable")
+        return None
+
+    # Initialize API
+    api = HfApi()
+
+    try:
+        # Test authentication first
+        user_info = api.whoami()
+        if not return_slug:
+            print(f"✅ Authenticated as: {user_info['name']}")
+
+        # Create the collection
+        if not return_slug:
+            print(f"📚 Creating collection: '{title}'...")
+        collection = api.create_collection(
+            title=title,
+            description=description,
+            private=private,
+            namespace=namespace
+        )
+
+        if not return_slug:
+            print(f"✅ Collection created successfully!")
+            print(f"📋 Collection slug: {collection.slug}")
+            print(f"🔗 Collection URL: https://huggingface.co/collections/{collection.slug}")
+
+        return collection
+
+    except Exception as e:
+        print(f"❌ Error creating collection: {e}")
+        return None
+
+def main():
+    # This script requires that the environment variable HF_TOKEN is set with your
+    # Hugging Face API token.
+    api = HfApi()
+
+    parser = argparse.ArgumentParser(description='Create a Huggingface Collection')
+    parser.add_argument('--name', '-n', help='The name/title of the Collection', required=True)
+    parser.add_argument('--description', '-d', help='The description for the Collection', required=True)
+    parser.add_argument('--namespace', '-ns', help='The namespace to add the Collection to', required=True)
+    parser.add_argument('--private', '-p', help='Create a private Collection', action='store_true')  # Fixed
+    parser.add_argument('--return-slug', '-s', help='Only output the collection slug', action='store_true')  # Fixed
+
+    args = parser.parse_args()
+
+    name = args.name
+    description = args.description
+    private = args.private
+    namespace = args.namespace
+    return_slug = args.return_slug
+
+    if not return_slug:
+        print("🚀 Creating Hugging Face Collection")
+        print(f"Title: {name}")
+        print(f"Description: {description}")
+        print(f"Namespace: {namespace}")
+        print(f"Private: {private}")
+
+    collection = create_collection(
+        title=name,
+        description=description,
+        private=private,
+        namespace=namespace,
+        return_slug=return_slug
+    )
+
+    if collection:
+        if return_slug:
+            print(collection.slug)
+        else:
+            print("\n🎉 Collection created successfully!")
+            print(f"Use this slug to add models: {collection.slug}")
+    else:
+        print("\n❌ Failed to create collection")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
--- a/examples/model-conversion/scripts/utils/hf-create-model.py
+++ b/examples/model-conversion/scripts/utils/hf-create-model.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+from huggingface_hub import HfApi
+import argparse
+
+# This script requires that the environment variable HF_TOKEN is set with your
+# Hugging Face API token.
+api = HfApi()
+
+def load_template_and_substitute(template_path, **kwargs):
+    try:
+        with open(template_path, 'r', encoding='utf-8') as f:
+            template_content = f.read()
+
+        return template_content.format(**kwargs)
+    except FileNotFoundError:
+        print(f"Template file '{template_path}' not found!")
+        return None
+    except KeyError as e:
+        print(f"Missing template variable: {e}")
+        return None
+
+# Command-line interface: repository name/namespace plus card and dry-run options.
+parser = argparse.ArgumentParser(description='Create a new Hugging Face model repository')
+parser.add_argument('--model-name', '-m', help='Name for the model', required=True)
+parser.add_argument('--namespace', '-ns', help='Namespace to add the model to', required=True)
+parser.add_argument('--org-base-model', '-b', help='Original Base model name', default="")
+parser.add_argument('--no-card', action='store_true', help='Skip creating model card')
+parser.add_argument('--private', '-p', action='store_true', help='Create private model')
+parser.add_argument('--embedding', '-e', action='store_true', help='Use embedding model card template')
+parser.add_argument('--dry-run', '-d', action='store_true', help='Print repository info and template without creating repository')
+
+args = parser.parse_args()
+
+# The repository is always named "<namespace>/<model-name>-GGUF".
+repo_id = f"{args.namespace}/{args.model_name}-GGUF"
+print("Repository ID: ", repo_id)
+
+# Create the repository unless this is a dry run; exist_ok=False is passed to
+# create_repo so an existing repository is not silently reused.
+repo_url = None
+if not args.dry_run:
+    repo_url = api.create_repo(
+        repo_id=repo_id,
+        repo_type="model",
+        private=args.private,
+        exist_ok=False
+    )
+
+if not args.no_card:
+    # Template paths are relative to the current working directory.
+    if args.embedding:
+        template_path = "scripts/embedding/modelcard.template"
+    else:
+        template_path = "scripts/causal/modelcard.template"
+
+    print("Template path: ", template_path)
+
+    model_card_content = load_template_and_substitute(
+        template_path,
+        model_name=args.model_name,
+        namespace=args.namespace,
+        base_model=args.org_base_model,
+    )
+
+    if args.dry_run:
+        # Dry run: show the rendered card (None if the template failed to load).
+        print("\nTemplate Content:\n")
+        print(model_card_content)
+    else:
+        if model_card_content:
+            # Upload the rendered card as the repository README.
+            api.upload_file(
+                path_or_fileobj=model_card_content.encode('utf-8'),
+                path_in_repo="README.md",
+                repo_id=repo_id
+            )
+            print("Model card created successfully.")
+        else:
+            print("Failed to create model card.")
+
+if not args.dry_run and repo_url:
+    print(f"Repository created: {repo_url}")
+
+
--- a/examples/model-conversion/scripts/utils/hf-upload-gguf-model.py
+++ b/examples/model-conversion/scripts/utils/hf-upload-gguf-model.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+
+from huggingface_hub import HfApi
+import argparse
+import os
+
+def upload_gguf_file(local_file_path, repo_id, filename_in_repo=None):
+    """
+    Upload a GGUF file to a Hugging Face model repository
+
+    Args:
+        local_file_path: Path to your local GGUF file
+        repo_id: Your repository ID (e.g., "username/model-name")
+        filename_in_repo: Optional custom name for the file in the repo
+    """
+
+    if not os.path.exists(local_file_path):
+        print(f"❌ File not found: {local_file_path}")
+        return False
+
+    if filename_in_repo is None:
+        filename_in_repo = os.path.basename(local_file_path)
+
+    if filename_in_repo is None or filename_in_repo == "":
+        filename_in_repo = os.path.basename(local_file_path)
+
+    print(f"📤 Uploading {local_file_path} to {repo_id}/{filename_in_repo}")
+
+    api = HfApi()
+
+    try:
+        api.upload_file(
+            path_or_fileobj=local_file_path,
+            path_in_repo=filename_in_repo,
+            repo_id=repo_id,
+            repo_type="model",
+            commit_message=f"Upload {filename_in_repo}"
+        )
+
+        print("✅ Upload successful!")
+        print(f"🔗 File available at: https://huggingface.co/{repo_id}/blob/main/{filename_in_repo}")
+        return True
+
+    except Exception as e:
+        print(f"❌ Upload failed: {e}")
+        return False
+
+# This script requires that the environment variable HF_TOKEN is set with your
+# Hugging Face API token.
+api = HfApi()
+
+parser = argparse.ArgumentParser(description='Upload a GGUF model to a Huggingface model repository')
+parser.add_argument('--gguf-model-path', '-m', help='The GGUF model file to upload', required=True)
+parser.add_argument('--repo-id', '-r', help='The repository to upload to', required=True)
+parser.add_argument('--name', '-o', help='The name in the model repository', required=False)
+args = parser.parse_args()
+
+upload_gguf_file(args.gguf_model_path, args.repo_id, args.name)
--- a/examples/model-conversion/scripts/utils/inspect-converted-model.sh
+++ b/examples/model-conversion/scripts/utils/inspect-converted-model.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+
+# Dump the metadata and tensor list of a converted GGUF model.
+# The model path is taken from the first argument, falling back to the
+# CONVERTED_MODEL environment variable.
+CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
+
+# Final check if we have a model path
+if [ -z "$CONVERTED_MODEL" ]; then
+    echo "Error: Model path must be provided either as:" >&2
+    echo "  1. Command line argument" >&2
+    echo "  2. CONVERTED_MODEL environment variable" >&2
+    exit 1
+fi
+
+# Quoted so paths containing spaces or glob characters are passed intact.
+../../gguf-py/gguf/scripts/gguf_dump.py "$CONVERTED_MODEL"
--- a/examples/model-conversion/scripts/utils/inspect-org-model.py
+++ b/examples/model-conversion/scripts/utils/inspect-org-model.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import json
+from safetensors import safe_open
+from collections import defaultdict
+
+# List every tensor (name, shape, dtype) stored in a safetensors model
+# directory, handling both sharded and single-file layouts.
+parser = argparse.ArgumentParser(description='Process model with specified path')
+parser.add_argument('--model-path', '-m', help='Path to the model')
+args = parser.parse_args()
+
+# NOTE(review): when MODEL_PATH is set it wins over --model-path, which is
+# only used as the fallback - confirm this precedence is intended.
+model_path = os.environ.get('MODEL_PATH', args.model_path)
+if model_path is None:
+    parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
+
+# Check if there's an index file (multi-file model)
+index_path = os.path.join(model_path, "model.safetensors.index.json")
+single_file_path = os.path.join(model_path, "model.safetensors")
+
+if os.path.exists(index_path):
+    # Multi-file model
+    print("Multi-file model detected")
+
+    with open(index_path, 'r') as f:
+        index_data = json.load(f)
+
+    # Get the weight map (tensor_name -> file_name)
+    weight_map = index_data.get("weight_map", {})
+
+    # Group tensors by file for efficient processing
+    # (each shard is then opened only once)
+    file_tensors = defaultdict(list)
+    for tensor_name, file_name in weight_map.items():
+        file_tensors[file_name].append(tensor_name)
+
+    print("Tensors in model:")
+
+    # Process each shard file
+    for file_name, tensor_names in file_tensors.items():
+        file_path = os.path.join(model_path, file_name)
+        print(f"\n--- From {file_name} ---")
+
+        with safe_open(file_path, framework="pt") as f:
+            # Tensor names are printed in sorted order within each shard.
+            for tensor_name in sorted(tensor_names):
+                tensor = f.get_tensor(tensor_name)
+                print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")
+
+elif os.path.exists(single_file_path):
+    # Single file model (original behavior)
+    print("Single-file model detected")
+
+    with safe_open(single_file_path, framework="pt") as f:
+        keys = f.keys()
+        print("Tensors in model:")
+        for key in sorted(keys):
+            tensor = f.get_tensor(key)
+            print(f"- {key} : shape = {tensor.shape}, dtype = {tensor.dtype}")
+
+else:
+    # Neither layout found: show what is in the directory to help diagnose,
+    # then exit with a failure status.
+    print(f"Error: Neither 'model.safetensors.index.json' nor 'model.safetensors' found in {model_path}")
+    print("Available files:")
+    if os.path.exists(model_path):
+        for item in sorted(os.listdir(model_path)):
+            print(f"  {item}")
+    else:
+        print(f"  Directory {model_path} does not exist")
+    exit(1)
--- a/examples/model-conversion/scripts/utils/perplexity-gen.sh
+++ b/examples/model-conversion/scripts/utils/perplexity-gen.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+# Run llama-perplexity on the wikitext-2 test set with the converted model and
+# save the result to ppl/<model-file-name>.kld via --kl-divergence-base.
+# The model path is taken from the first argument, falling back to the
+# CONVERTED_MODEL environment variable.
+
+set -e
+
+CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
+
+# Final check if we have a model path
+if [ -z "$CONVERTED_MODEL" ]; then
+    echo "Error: Model path must be provided either as:" >&2
+    echo "  1. Command line argument" >&2
+    echo "  2. CONVERTED_MODEL environment variable" >&2
+    exit 1
+fi
+
+# Check if ppl/wikitext-2-raw directory exists
+if [ ! -d "ppl/wikitext-2-raw" ]; then
+    echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2
+    mkdir -p ppl
+    pushd ppl
+    ./../../../scripts/get-wikitext-2.sh
+    popd
+fi
+
+mkdir -p ppl
+# Paths are quoted throughout so model paths with spaces work.
+OUTPUTFILE="ppl/$(basename "$CONVERTED_MODEL").kld"
+echo "Model: $CONVERTED_MODEL"
+
+cmake --build ../../build --target llama-perplexity -j8
+
+../.././build/bin/llama-perplexity -m "$CONVERTED_MODEL" \
+    -f ppl/wikitext-2-raw/wiki.test.raw \
+    --kl-divergence-base "$OUTPUTFILE"
+
+echo "Generated logits in $OUTPUTFILE"
+
--- a/examples/model-conversion/scripts/utils/perplexity-run-simple.sh
+++ b/examples/model-conversion/scripts/utils/perplexity-run-simple.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+# Run llama-perplexity on the wikitext-2 test set with a quantized model.
+# The model path is taken from the first argument, falling back to the
+# QUANTIZED_MODEL environment variable.
+
+set -e
+
+QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}"
+
+if [ -z "$QUANTIZED_MODEL" ]; then
+    echo "Error: Model path must be provided either as:" >&2
+    echo "  1. Command line argument" >&2
+    echo "  2. QUANTIZED_MODEL environment variable" >&2
+    exit 1
+fi
+
+# Check if ppl/wikitext-2-raw directory exists
+if [ ! -d "ppl/wikitext-2-raw" ]; then
+    echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2
+    mkdir -p ppl
+    pushd ppl
+    ./../../../scripts/get-wikitext-2.sh
+    popd
+fi
+
+cmake --build ../../build --target llama-perplexity -j8
+
+# Quoted so model paths containing spaces are passed as one argument.
+../.././build/bin/llama-perplexity -m "$QUANTIZED_MODEL" -f ppl/wikitext-2-raw/wiki.test.raw
+
+
--- a/examples/model-conversion/scripts/utils/perplexity-run.sh
+++ b/examples/model-conversion/scripts/utils/perplexity-run.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+# Run llama-perplexity in KL-divergence mode for a quantized model against a
+# previously saved base logits file.
+#
+# Usage: perplexity-run.sh [quantized-model] [logits-file]
+# Each argument falls back to the QUANTIZED_MODEL / LOGITS_FILE environment
+# variable when omitted.
+
+set -e
+
+QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}"
+# Second positional argument; reading $1 here would make the logits file
+# silently equal to the model path.
+LOGITS_FILE="${2:-"$LOGITS_FILE"}"
+
+if [ -z "$QUANTIZED_MODEL" ]; then
+    echo "Error: Model path must be provided either as:" >&2
+    echo "  1. Command line argument" >&2
+    echo "  2. QUANTIZED_MODEL environment variable" >&2
+    exit 1
+fi
+
+# Quoted so that an empty or space-containing value is still tested as a
+# single file name (an unquoted empty value makes the check pass vacuously).
+if [ ! -f "${LOGITS_FILE}" ]; then
+    echo "Error: logits file '${LOGITS_FILE}' was not found" >&2
+    echo "Did you run the perplexity-gen.sh script?" >&2
+    exit 1
+fi
+
+echo "Model: $QUANTIZED_MODEL"
+echo "Data file: $LOGITS_FILE"
+
+cmake --build ../../build --target llama-perplexity -j8
+
+../.././build/bin/llama-perplexity -m "$QUANTIZED_MODEL" \
+    --kl-divergence-base "$LOGITS_FILE" \
+    --kl-divergence
--- a/examples/model-conversion/scripts/utils/quantize.sh
+++ b/examples/model-conversion/scripts/utils/quantize.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+set -e
+
+CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
+QUANTIZED_TYPE="${2:-"$QUANTIZED_TYPE"}"
+TOKEN_EMBD_TYPE="${3:-"${TOKEN_EMBD_TYPE}"}"
+OUTPUT_TYPE="${4:-"${OUTPUT_TYPE}"}"
+QUANTIZED_MODEL=$CONVERTED_MODEL
+
+# Final check if we have a model path
+if [ -z "$CONVERTED_MODEL" ]; then
+    echo "Error: Model path must be provided either as:" >&2
+    echo "  1. Command line argument" >&2
+    echo "  2. CONVERTED_MODEL environment variable" >&2
+    exit 1
+fi
+
+if [ -z "$QUANTIZED_TYPE" ]; then
+    echo "Error: QUANTIZED_TYPE is required" >&2
+    exit 1
+fi
+
+echo $CONVERTED_MODEL
+
+# Process the quantized model filename
+if [[ "$QUANTIZED_MODEL" == *.gguf ]]; then
+    # Remove .gguf suffix, add quantized type, then add .gguf back
+    BASE_NAME="${QUANTIZED_MODEL%.gguf}"
+    QUANTIZED_MODEL="${BASE_NAME}-${QUANTIZED_TYPE}.gguf"
+else
+    echo "Error: QUANTIZED_MODEL must end with .gguf extension" >&2
+    exit 1
+fi
+
+cmake --build ../../build --target llama-quantize -j8
+
+echo $TOKEN_EMBD_TYPE
+echo $OUTPUT_TYPE
+
+CMD_ARGS=("../../build/bin/llama-quantize")
+[[ -n "$TOKEN_EMBD_TYPE" ]] && CMD_ARGS+=("--token-embedding-type" "$TOKEN_EMBD_TYPE")
+[[ -n "$OUTPUT_TYPE" ]]     && CMD_ARGS+=("--output-tensor-type" "$OUTPUT_TYPE")
+CMD_ARGS+=("$CONVERTED_MODEL" "$QUANTIZED_MODEL" "$QUANTIZED_TYPE")
+
+"${CMD_ARGS[@]}"
+
+echo "Quantized model saved to: $QUANTIZED_MODEL"
--- a/examples/model-conversion/scripts/utils/run-embedding-server.sh
+++ b/examples/model-conversion/scripts/utils/run-embedding-server.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# Build and start llama-server in embedding mode with pooling disabled.
+# The model path is taken from the first argument, falling back to the
+# CONVERTED_MODEL environment variable.
+
+set -e
+
+CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
+
+# Final check if we have a model path
+if [ -z "$CONVERTED_MODEL" ]; then
+    echo "Error: Model path must be provided either as:" >&2
+    echo "  1. Command line argument" >&2
+    echo "  2. CONVERTED_MODEL environment variable" >&2
+    exit 1
+fi
+
+# Quoted so paths with spaces or glob characters are handled verbatim.
+echo "$CONVERTED_MODEL"
+
+cmake --build ../../build --target llama-server
+
+../../build/bin/llama-server -m "$CONVERTED_MODEL" \
+    --embedding \
+    --pooling none
--- a/examples/model-conversion/scripts/utils/semantic_check.py
+++ b/examples/model-conversion/scripts/utils/semantic_check.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python3
+
+import numpy as np
+import argparse
+import os
+import importlib
+
+from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
+
+# Optional model name read from the UNRELEASED_MODEL_NAME environment variable.
+# When set, main() imports the model class from
+# transformers.models.<name>.modular_<name> (lower-cased name) instead of
+# going through the Auto* classes.
+unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
+
+def cosine_similarity(a, b=None):
+    a = np.asarray(a)
+    if b is None:
+        b = a
+    else:
+        b = np.asarray(b)
+
+    if a.ndim == 1:
+        a = a.reshape(1, -1)
+    if b.ndim == 1:
+        b = b.reshape(1, -1)
+
+    a_norms = np.linalg.norm(a, axis=1, keepdims=True)
+    b_norms = np.linalg.norm(b, axis=1, keepdims=True)
+
+    a_norms = np.where(a_norms == 0, 1e-8, a_norms)
+    b_norms = np.where(b_norms == 0, 1e-8, b_norms)
+
+    a_normalized = a / a_norms
+    b_normalized = b / b_norms
+
+    # Compute cosine similarity
+    return np.dot(a_normalized, b_normalized.T)
+
+def load_embeddings_from_file(filename, n_tokens, n_embd):
+    embeddings = np.fromfile(filename, dtype=np.float32)
+    # Check if this is pooled (single embedding) or per-token embeddings
+    if len(embeddings) == n_embd:
+        return embeddings.reshape(1, n_embd)
+    else:
+        return embeddings.reshape(n_tokens, n_embd)
+
+def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
+    np.set_printoptions(suppress=True, precision=6)
+    print("pytorch embeddings:");
+    print(python_emb)
+    print("llama.cpp embeddings:");
+    print(cpp_emb)
+    print(f"\n=== Prompt: '{prompt}' ===")
+    print(f"Tokens: {tokens}")
+    print(f"Embeddings shape: Python {python_emb.shape}, llama.cpp {cpp_emb.shape}")
+
+    n_tokens = len(tokens)
+    is_pooled = python_emb.shape[0] == 1
+
+    if is_pooled:
+        print(f"\n[Pooled Embeddings Mode - comparing single sentence embeddings]")
+
+        # 1. Direct embedding comparison for pooled embeddings
+        print(f"\n1. Raw Embedding Magnitude Comparison:")
+        py_mag = np.linalg.norm(python_emb[0])
+        cpp_mag = np.linalg.norm(cpp_emb[0])
+        ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
+        print(f"   Pooled embedding: Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")
+
+        # 2. Cross-model similarity for pooled embeddings
+        print(f"\n2. Cross-Model Pooled Embedding Similarity:")
+        sim = cosine_similarity([python_emb[0]], [cpp_emb[0]])[0][0]
+        print(f"   Cosine similarity: {sim:.6f}")
+
+        return {
+            'cross_model_similarities': [sim],
+            'similarity_matrix_diff': np.array([[0.0]]),
+            'max_diff': 0.0,
+            'mean_diff': 0.0,
+            'rms_diff': 0.0
+        }
+    else:
+        # Original per-token comparison logic
+        # 1. Direct embedding comparison
+        print(f"\n1. Raw Embedding Magnitude Comparison:")
+        # Check if the distance of each token embedding from the origin and compare
+        # if the vectors are on the same "sphere". This does not tell us about
+        # direction (meaning of the token embedding), just magnitude.
+        for i in range(n_tokens):
+            py_mag = np.linalg.norm(python_emb[i]) # calculate standard euclidean norm for Python embeddings
+            cpp_mag = np.linalg.norm(cpp_emb[i])   # calculate standard euclidean norm for llama.cpp embeddings
+            ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
+            print(f"   Token {i} ({tokens[i]}): Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")
+
+        # 2. Cosine similarity between tokens within each model
+        # Here we check the direction of token embeddings to see if the have the
+        # same meaning (similarity). This is done by calculating cosine similarity
+        # of a pair of token embeddings within each model.
+        print(f"\n2. Within-Model Token Similarities:")
+        print("   Python model:")
+        for i in range(n_tokens):
+            for j in range(i+1, n_tokens):
+                sim = cosine_similarity([python_emb[i]], [python_emb[j]])[0][0]
+                print(f"     {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")
+
+        print("   llama.cpp model:")
+        for i in range(n_tokens):
+            for j in range(i+1, n_tokens):
+                sim = cosine_similarity([cpp_emb[i]], [cpp_emb[j]])[0][0]
+                print(f"     {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")
+
+        # 3. Cross-model similarity (same token position)
+        print(f"\n3. Cross-Model Same-Token Similarities:")
+        for i in range(n_tokens):
+            sim = cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0]
+            print(f"   Token {i} ({tokens[i]}): {sim:.4f}")
+
+        # 4. Similarity matrix comparison
+        print(f"\n4. Similarity Matrix Differences:")
+        py_sim_matrix = cosine_similarity(python_emb)
+        cpp_sim_matrix = cosine_similarity(cpp_emb)
+        diff_matrix = np.abs(py_sim_matrix - cpp_sim_matrix)
+
+        print(f"   Max difference: {np.max(diff_matrix):.4f}")
+        print(f"   Mean difference: {np.mean(diff_matrix):.4f}")
+        print(f"   RMS difference: {np.sqrt(np.mean(diff_matrix**2)):.4f}")
+
+        return {
+            'cross_model_similarities': [cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0] for i in range(n_tokens)],
+            'similarity_matrix_diff': diff_matrix,
+            'max_diff': np.max(diff_matrix),
+            'mean_diff': np.mean(diff_matrix),
+            'rms_diff': np.sqrt(np.mean(diff_matrix**2))
+        }
+
+def read_prompt_from_file(file_path):
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return f.read().strip()
+    except FileNotFoundError:
+        print(f"Error: Prompts file '{file_path}' not found")
+        exit(1)
+    except Exception as e:
+        print(f"Error reading prompts file: {e}")
+        exit(1)
+
+def main():
+    parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings')
+    parser.add_argument('--model-path', '-m', required=True, help='Path to the original Python model')
+    parser.add_argument('--python-embeddings', '-pe', help='Path to pytorch embeddings "logits" binary file')
+    parser.add_argument('--cpp-embeddings', '-ce', help='Path to llama.cpp embeddings "logits" binary file')
+    parser.add_argument('--causal', '-c', default=False, help='if the model is causal (default: false)', action='store_true')
+    parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt')
+    parser.add_argument('--prompts-file', '-pf', help='Path to file containing prompts')
+
+    args = parser.parse_args()
+
+    if args.prompts_file:
+        prompt = read_prompt_from_file(args.prompts_file)
+    else:
+        prompt = args.prompt
+
+    print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
+    print("=" * 70)
+
+    # Single prompt detailed comparison
+    print(f"\nTesting with prompt: '{prompt}'")
+
+    # Load the python model to get configuration information and also to load the tokenizer.
+    print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
+    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
+    config = AutoConfig.from_pretrained(args.model_path)
+
+    if unreleased_model_name:
+        model_name_lower = unreleased_model_name.lower()
+        unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
+        if args.causal:
+            class_name = f"{unreleased_model_name}ForCausalLM"
+        else:
+            class_name = f"{unreleased_model_name}Model"
+        print(f"Model class: {class_name}")
+        print(f"Importing unreleased model module: {unreleased_module_path}")
+
+        try:
+            model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
+            model = model_class.from_pretrained(args.model_path)
+        except (ImportError, AttributeError) as e:
+            print(f"Failed to import or load model: {e}")
+            exit(1)
+    else:
+        if args.causal:
+            model = AutoModelForCausalLM.from_pretrained(args.model_path)
+        else:
+            model = AutoModel.from_pretrained(args.model_path)
+
+    encoded = tokenizer(prompt, return_tensors="pt")
+    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
+    n_tokens = len(tokens)
+    print(f"n_tokens: {n_tokens}");
+    print(f"hidden_size: {model.config.hidden_size}")
+
+    # Load binary embeddings from data directory.
+    llamacpp_embeddings = load_embeddings_from_file(args.cpp_embeddings, n_tokens, model.config.hidden_size)
+    python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size)
+
+    # Run comparison
+    results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, prompt)
+
+    # Summary
+    print(f"\n=== SUMMARY ===")
+    avg_cross_sim = np.mean(results['cross_model_similarities'])
+    print(f"Average cross-model similarity: {avg_cross_sim:.4f}")
+    print(f"Similarity matrix RMS difference: {results['rms_diff']:.4f}")
+
+    # Quality assessment
+    if avg_cross_sim > 0.95:
+        print("✅ EXCELLENT: Models are highly similar")
+    elif avg_cross_sim > 0.90:
+        print("✅ VERY GOOD: Models are very similar")
+    elif avg_cross_sim > 0.80:
+        print("⚠️  GOOD: Models are reasonably similar")
+    elif avg_cross_sim > 0.70:
+        print("⚠️  FAIR: Models have some differences")
+    else:
+        print("❌ POOR: Models are significantly different")
+
+# Run the comparison only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()