Sync from upstream llama.cpp repository

This commit is contained in:
2026-01-16 10:43:34 +08:00
parent 3bc369a6f7
commit f4ae4cc7da
2053 changed files with 956010 additions and 1 deletions

View File

@@ -0,0 +1,177 @@
#!/usr/bin/env python3
import numpy as np
import sys
import os
import argparse
from pathlib import Path
from common import get_model_name_from_env_path # type: ignore[import-not-found]
def calculate_nmse(reference, test):
mse = np.mean((test - reference) ** 2)
ref_var = np.var(reference)
if ref_var == 0:
nmse = float('inf') if mse > 0 else 0.0
return mse, mse, ref_var
nmse = mse / ref_var
return nmse, mse, ref_var
def load_logits(file_path):
if not os.path.exists(file_path):
raise FileNotFoundError(f"File not found: {file_path}")
if file_path.suffix == '.npy':
return np.load(file_path)
elif file_path.suffix == '.bin':
return np.fromfile(file_path, dtype=np.float32)
else:
# Try to load as text file
try:
# If it has index format "0: value", extract just values
data = []
with open(file_path, 'r') as f:
for line in f:
if ':' in line:
# Format: "index: value"
value = float(line.split(':')[1].strip())
else:
# Just the value
value = float(line.strip())
data.append(value)
return np.array(data, dtype=np.float32)
except:
return np.loadtxt(file_path, dtype=np.float32)
def interpret_nmse(nmse):
"""Provide interpretation of NMSE value"""
if nmse == 0:
return "Perfect match", "🎉"
elif nmse < 1e-6:
return "Essentially identical", ""
elif nmse < 1e-4:
return "Excellent match", ""
elif nmse < 1e-3:
return "Very good match", "👍"
elif nmse < 1e-2:
return "Good match", "👍"
elif nmse < 0.1:
return "Acceptable match", "⚠️"
elif nmse < 1.0:
return "Poor match", ""
else:
return "Very poor match (worse than noise)", ""
def main():
parser = argparse.ArgumentParser(description='Validate model logits')
parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory')
args = parser.parse_args()
model_name = get_model_name_from_env_path('MODEL_PATH')
data_dir = Path("data")
pytorch_file = data_dir / f"pytorch-{model_name}.bin"
llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL')
llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin"
print(f"Model name: {model_name}")
print(f"PyTorch logits file: {pytorch_file}")
print(f"llama.cpp logits file: {llamacpp_file}")
reference_file = pytorch_file
test_file = llamacpp_file
print("📊 NMSE Check for Model Comparison")
print("=" * 50)
print(f"Reference (ground truth): {reference_file}")
print(f"Test (to evaluate): {test_file}")
print()
try:
print("Loading reference logits...")
reference = load_logits(reference_file)
print(f" Shape: {reference.shape}, Type: {reference.dtype}")
print("Loading test logits...")
test = load_logits(test_file)
print(f" Shape: {test.shape}, Type: {test.dtype}")
# Check shapes match
if reference.shape != test.shape:
print(f"\n❌ Error: Shape mismatch!")
print(f" Reference: {reference.shape}")
print(f" Test: {test.shape}")
sys.exit(1)
print(f"\n✅ Shapes match: {reference.shape}")
nmse, mse, ref_var = calculate_nmse(reference, test)
# Additional metrics
max_abs_error = np.max(np.abs(test - reference))
mean_abs_error = np.mean(np.abs(test - reference))
# Results
print(f"\n📈 METRICS")
print("=" * 30)
print(f"MSE (Mean Squared Error): {mse:.6e}")
print(f"Reference Variance: {ref_var:.6e}")
print(f"NMSE: {nmse:.6e}")
print(f"Max Absolute Error: {max_abs_error:.6f}")
print(f"Mean Absolute Error: {mean_abs_error:.6f}")
# NMSE in dB (common in signal processing)
if nmse > 0:
nmse_db = 10 * np.log10(nmse)
print(f"NMSE (dB): {nmse_db:.2f} dB")
# Interpretation
interpretation, emoji = interpret_nmse(nmse)
print(f"\n🎯 INTERPRETATION")
print("=" * 30)
print(f"{emoji} {interpretation}")
# Detailed guidance
print(f"\n📋 GUIDANCE")
print("=" * 30)
if nmse < 1e-3:
print("✅ EXCELLENT: Your GGML conversion is working very well!")
print(" The differences are negligible for practical use.")
elif nmse < 1e-2:
print("👍 GOOD: Your GGML conversion is working well.")
print(" Small differences are likely due to precision/quantization.")
elif nmse < 0.1:
print("⚠️ ACCEPTABLE: Conversion is working but with some differences.")
print(" Check if you're using quantization (Q4, Q8, etc.)")
print(" Test generation quality to see if it's acceptable.")
else:
print("❌ PROBLEMATIC: Large differences detected.")
print(" Check your conversion process for potential issues.")
print(" Verify you're using the same model weights.")
# NMSE benchmarks
print(f"\n📚 NMSE BENCHMARKS")
print("=" * 30)
print("< 1e-6: Essentially identical")
print("< 1e-4: Excellent (typical for good conversions)")
print("< 1e-3: Very good")
print("< 1e-2: Good (acceptable for most use cases)")
print("< 0.1: Acceptable (may need verification)")
print("> 1.0: Poor (worse than random)")
# Exit code based on NMSE
if nmse < 1e-2:
print(f"\n✅ RESULT: PASS (NMSE = {nmse:.2e})")
sys.exit(0)
else:
print(f"\n❌ RESULT: NEEDS REVIEW (NMSE = {nmse:.2e})")
sys.exit(1)
except Exception as e:
print(f"❌ Error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,299 @@
#!/usr/bin/env python3
import os
import sys
import torch
import transformers
import json
import textwrap
import numpy as np
from pathlib import Path
def get_model_name_from_env_path(env_path_name):
model_path = os.getenv(env_path_name)
if not model_path:
print(f"Error: {env_path_name} environment variable not set")
sys.exit(1)
if not os.path.exists(model_path):
print(f"Error: Model file not found: {model_path}")
sys.exit(1)
name = os.path.basename(os.path.normpath(model_path))
if name.endswith(".gguf"):
name = name[:-5]
return name
def summarize(tensor: torch.Tensor, name: str, max_seq: int = 3, max_vals: int = 3):
"""
Print a tensor in llama.cpp debug style.
Supports:
- 2D tensors (seq, hidden)
- 3D tensors (batch, seq, hidden)
- 4D tensors (batch, seq, heads, dim_per_head) via flattening heads × dim_per_head
Shows first and last max_vals of each vector per sequence position.
"""
t = tensor.detach().to(torch.float32).cpu()
# Determine dimensions
if t.ndim == 3:
_, s, _ = t.shape
elif t.ndim == 2:
_, s = 1, t.shape[0]
t = t.unsqueeze(0)
elif t.ndim == 4:
_, s, _, _ = t.shape
else:
print(f"Skipping tensor due to unsupported dimensions: {t.ndim}")
return
ten_shape = t.shape
print(f"ggml_debug: {name} = (f32) ... = {{{ten_shape}}}")
print(" [")
print(" [")
# Determine indices for first and last sequences
first_indices = list(range(min(s, max_seq)))
last_indices = list(range(max(0, s - max_seq), s))
# Check if there's an overlap between first and last indices or if we're at the edge case of s = 2 * max_seq
has_overlap = bool(set(first_indices) & set(last_indices)) or (max_seq * 2 == s)
# Combine indices
if has_overlap:
# If there's overlap, just use the combined unique indices
indices = sorted(list(set(first_indices + last_indices)))
separator_index = None
else:
# If no overlap, we'll add a separator between first and last sequences
indices = first_indices + last_indices
separator_index = len(first_indices)
for i, si in enumerate(indices):
# Add separator if needed
if separator_index is not None and i == separator_index:
print(" ...")
# Extract appropriate slice
vec = t[0, si]
if vec.ndim == 2: # 4D case: flatten heads × dim_per_head
flat = vec.flatten().tolist()
else: # 2D or 3D case
flat = vec.tolist()
# First and last slices
first = flat[:max_vals]
last = flat[-max_vals:] if len(flat) >= max_vals else flat
first_str = ", ".join(f"{v:12.4f}" for v in first)
last_str = ", ".join(f"{v:12.4f}" for v in last)
print(f" [{first_str}, ..., {last_str}]")
print(" ],")
print(" ]")
print(f" sum = {t.sum().item():.6f}\n")
def debug_hook(name):
def fn(_m, input, output):
if isinstance(input, torch.Tensor):
summarize(input, name + "_in")
elif isinstance(input, (tuple, list)) and len(input) > 0 and isinstance(input[0], torch.Tensor):
summarize(input[0], name + "_in")
if isinstance(output, torch.Tensor):
summarize(output, name + "_out")
elif isinstance(output, (tuple, list)) and len(output) > 0 and isinstance(output[0], torch.Tensor):
summarize(output[0], name + "_out")
return fn
def setup_rope_debug(model_module_path: str, function_name: str = "apply_rotary_pos_emb"):
"""
Apply monkey patch to dump RoPE activations for debugging.
Args:
model_module_path: Path to the model module (e.g., "transformers.models.apertus.modeling_apertus")
function_name: Name of the RoPE function to patch (default: "apply_rotary_pos_emb")
Example:
from utils.common import setup_rope_debug
setup_rope_debug("transformers.models.apertus.modeling_apertus")
"""
import importlib
# Import the module and get the original function
module = importlib.import_module(model_module_path)
orig_rope = getattr(module, function_name)
# Set torch print options for better debugging
torch.set_printoptions(threshold=float('inf'))
torch.set_printoptions(precision=6, sci_mode=False)
def debug_rope(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
# log inputs
summarize(q, "RoPE.q_in")
summarize(k, "RoPE.k_in")
# call original
q_out, k_out = orig_rope(q, k, cos, sin, position_ids, unsqueeze_dim)
# log outputs
summarize(q_out, "RoPE.q_out")
summarize(k_out, "RoPE.k_out")
return q_out, k_out
# Patch it
setattr(module, function_name, debug_rope)
print(f"RoPE debug patching applied to {model_module_path}.{function_name}")
def save_output_data(data, tokens, prompt, model_name, type_suffix="", output_dir="data"):
"""
Save output data (logits/embeddings), tokens, and prompt to files.
Args:
data: numpy array of floats (logits or embeddings)
tokens: list or array of token IDs
prompt: string containing the input prompt
model_name: name of the model
type_suffix: optional suffix like "-embeddings" (default: "")
output_dir: directory to save files (default: "data")
Creates the following files in output_dir:
- pytorch-{model_name}{type_suffix}.bin
- pytorch-{model_name}{type_suffix}.txt
- pytorch-{model_name}{type_suffix}-prompt.txt
- pytorch-{model_name}{type_suffix}-tokens.bin
"""
data_dir = Path(output_dir)
data_dir.mkdir(exist_ok=True)
base_path = data_dir / f"pytorch-{model_name}{type_suffix}"
# Convert and flatten logits/embeddings
data = data.cpu().numpy() if isinstance(data, torch.Tensor) else np.asarray(data)
data = data.flatten() if data.ndim > 1 else data
# Save logits/embedding files
data.astype(np.float32).tofile(f"{base_path}.bin")
print(f"Data saved to {base_path}.bin")
with open(f"{base_path}.txt", "w") as f:
f.writelines(f"{i}: {value:.6f}\n" for i, value in enumerate(data))
print(f"Data saved to {base_path}.txt")
# Convert and flatten tokens
tokens = tokens.cpu().numpy() if isinstance(tokens, torch.Tensor) else np.asarray(tokens)
tokens = tokens.flatten() if tokens.ndim > 1 else tokens
# Save token binary file
tokens.astype(np.int32).tofile(f"{base_path}-tokens.bin")
print(f"Tokens saved to {base_path}-tokens.bin")
# Save prompt file
with open(f"{base_path}-prompt.txt", "w") as f:
f.write(f"prompt: {prompt}\n")
f.write(f"n_tokens: {len(tokens)}\n")
f.write(f"token ids: {', '.join(str(int(tid)) for tid in tokens)}\n")
print(f"Prompt saved to {base_path}-prompt.txt")
def compare_tokens(original, converted, type_suffix="", output_dir="data"):
data_dir = Path(output_dir)
# Read tokens from both models
tokens1_file = data_dir / f"{original}{type_suffix}-tokens.bin"
tokens2_file = data_dir / f"{converted}{type_suffix}-tokens.bin"
if not tokens1_file.exists():
print(f"Error: Token file not found: {tokens1_file}")
return False
if not tokens2_file.exists():
print(f"Error: Token file not found: {tokens2_file}")
return False
tokens1 = np.fromfile(tokens1_file, dtype=np.int32)
tokens2 = np.fromfile(tokens2_file, dtype=np.int32)
print(f"\nComparing tokens between:")
print(f" Original : {original} ({len(tokens1)} tokens)")
print(f" Converted: {converted} ({len(tokens2)} tokens)")
if len(tokens1) != len(tokens2):
print(f"\n❌ Token count mismatch: {len(tokens1)} vs {len(tokens2)}")
return False
if np.array_equal(tokens1, tokens2):
print(f"\n✅ All {len(tokens1)} tokens match!")
return True
mismatches = np.where(tokens1 != tokens2)[0]
print(f"\n❌ Found {len(mismatches)} mismatched tokens:")
num_to_show = min(len(mismatches), 10)
for idx in mismatches[:num_to_show]:
print(f" Position {idx}: {tokens1[idx]} vs {tokens2[idx]}")
if len(mismatches) > num_to_show:
print(f" ... and {len(mismatches) - num_to_show} more mismatches")
return False
def show_version_warning(current_version, model_version):
if not model_version:
return False
try:
from packaging.version import parse, InvalidVersion
try:
return parse(current_version) < parse(model_version)
except InvalidVersion:
return current_version != model_version
except ImportError:
return current_version != model_version
def get_model_transformers_version(model_path):
if not model_path:
return None
config_path = Path(model_path) / "config.json"
if not config_path.is_file():
return None
try:
with open(config_path, "r", encoding="utf-8") as f:
config = json.load(f)
return config.get("transformers_version")
except (IOError, json.JSONDecodeError) as e:
print(f"Warning: Could not read or parse {config_path}: {e}", file=sys.stderr)
return None
def exit_with_warning(message, model_path):
print(message)
if model_path and transformers is not None:
model_transformers_version = get_model_transformers_version(model_path)
transformers_version = transformers.__version__
if show_version_warning(transformers_version, model_transformers_version):
warning_message = f"""
=====================================================================
Verification failure might be due to a transformers version mismatch:
Current transformers version: {transformers_version}
Model's required version : {model_transformers_version}
Consider installing the version specified by the model's config:
pip install transformers=={model_transformers_version}
=====================================================================
"""
print(textwrap.dedent(warning_message))
sys.exit(1)

View File

@@ -0,0 +1,76 @@
#!/usr/bin/env python3
import argparse
import sys
from common import compare_tokens # type: ignore
def parse_arguments():
parser = argparse.ArgumentParser(
description='Compare tokens between two models',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
%(prog)s pytorch-gemma-3-270m-it llamacpp-gemma-3-270m-it-bf16
"""
)
parser.add_argument(
'original',
help='Original model name'
)
parser.add_argument(
'converted',
help='Converted model name'
)
parser.add_argument(
'-s', '--suffix',
default='',
help='Type suffix (e.g., "-embeddings")'
)
parser.add_argument(
'-d', '--data-dir',
default='data',
help='Directory containing token files (default: data)'
)
parser.add_argument(
'-v', '--verbose',
action='store_true',
help='Print prompts from both models'
)
return parser.parse_args()
def main():
args = parse_arguments()
if args.verbose:
from pathlib import Path
data_dir = Path(args.data_dir)
prompt1_file = data_dir / f"{args.original}{args.suffix}-prompt.txt"
prompt2_file = data_dir / f"{args.converted}{args.suffix}-prompt.txt"
if prompt1_file.exists():
print(f"\nOriginal model prompt ({args.original}):")
print(f" {prompt1_file.read_text().strip()}")
if prompt2_file.exists():
print(f"\nConverted model prompt ({args.converted}):")
print(f" {prompt2_file.read_text().strip()}")
print()
result = compare_tokens(
args.original,
args.converted,
type_suffix=args.suffix,
output_dir=args.data_dir
)
# Enable the script to be used in shell scripts so that they can check
# the exit code for success/failure.
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,8 @@
#!/usr/bin/env bash
COLLECTION_SLUG=$(python ./create_collection.py --return-slug)
echo "Created collection: $COLLECTION_SLUG"
# Use it in the next command
python add_model_to_collection.py "$COLLECTION_SLUG" "username/my-model"

View File

@@ -0,0 +1,6 @@
#!/usr/bin/env bash
curl --request POST \
--url http://localhost:8080/embedding \
--header "Content-Type: application/json" \
--data '{"input": "Hello world today"}' \
--silent

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
from huggingface_hub import HfApi
import argparse
import sys
def add_model_to_collection(collection_slug, model_id, note=""):
"""
Add a model to an existing collection
Args:
collection_slug: The slug of the collection (e.g., "username/collection-name-12345")
model_id: The model repository ID (e.g., "username/model-name")
note: Optional note about the model
Returns:
True if successful, False if failed
"""
# Initialize API
api = HfApi()
try:
user_info = api.whoami()
print(f"✅ Authenticated as: {user_info['name']}")
# Verify the model exists
print(f"🔍 Checking if model exists: {model_id}")
try:
model_info = api.model_info(model_id)
except Exception as e:
print(f"❌ Model not found or not accessible: {model_id}")
print(f"Error: {e}")
return False
print(f"📚 Adding model to collection...")
api.add_collection_item(
collection_slug=collection_slug,
item_id=model_id,
item_type="model",
note=note
)
print(f"✅ Model added to collection successfully!")
print(f"🔗 Collection URL: https://huggingface.co/collections/{collection_slug}")
return True
except Exception as e:
print(f"❌ Error adding model to collection: {e}")
return False
def main():
# This script requires that the environment variable HF_TOKEN is set with your
# Hugging Face API token.
api = HfApi()
parser = argparse.ArgumentParser(description='Add model to a Huggingface Collection')
parser.add_argument('--collection', '-c', help='The collection slug username/collection-hash', required=True)
parser.add_argument('--model', '-m', help='The model to add to the Collection', required=True)
parser.add_argument('--note', '-n', help='An optional note/description', required=False)
args = parser.parse_args()
collection = args.collection
model = args.model
note = args.note
success = add_model_to_collection(
collection_slug=collection,
model_id=model,
note=note
)
if success:
print("\n🎉 Model added successfully!")
else:
print("\n❌ Failed to add model to collection")
sys.exit(1)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,106 @@
#!/usr/bin/env python3
from huggingface_hub import HfApi
import argparse
import os
import sys
def create_collection(title, description, private=False, namespace=None, return_slug=False):
"""
Create a new collection on Hugging Face
Args:
title: Collection title
description: Collection description
private: Whether the collection should be private (default: False)
namespace: Optional namespace (defaults to your username)
Returns:
Collection object if successful, None if failed
"""
# Check if HF_TOKEN is available
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
if not token:
print("❌ No HF_TOKEN or HUGGINGFACE_HUB_TOKEN found in environment variables")
print("Please set your Hugging Face token as an environment variable")
return None
# Initialize API
api = HfApi()
try:
# Test authentication first
user_info = api.whoami()
if not return_slug:
print(f"✅ Authenticated as: {user_info['name']}")
# Create the collection
if not return_slug:
print(f"📚 Creating collection: '{title}'...")
collection = api.create_collection(
title=title,
description=description,
private=private,
namespace=namespace
)
if not return_slug:
print(f"✅ Collection created successfully!")
print(f"📋 Collection slug: {collection.slug}")
print(f"🔗 Collection URL: https://huggingface.co/collections/{collection.slug}")
return collection
except Exception as e:
print(f"❌ Error creating collection: {e}")
return None
def main():
# This script requires that the environment variable HF_TOKEN is set with your
# Hugging Face API token.
api = HfApi()
parser = argparse.ArgumentParser(description='Create a Huggingface Collection')
parser.add_argument('--name', '-n', help='The name/title of the Collection', required=True)
parser.add_argument('--description', '-d', help='The description for the Collection', required=True)
parser.add_argument('--namespace', '-ns', help='The namespace to add the Collection to', required=True)
parser.add_argument('--private', '-p', help='Create a private Collection', action='store_true') # Fixed
parser.add_argument('--return-slug', '-s', help='Only output the collection slug', action='store_true') # Fixed
args = parser.parse_args()
name = args.name
description = args.description
private = args.private
namespace = args.namespace
return_slug = args.return_slug
if not return_slug:
print("🚀 Creating Hugging Face Collection")
print(f"Title: {name}")
print(f"Description: {description}")
print(f"Namespace: {namespace}")
print(f"Private: {private}")
collection = create_collection(
title=name,
description=description,
private=private,
namespace=namespace,
return_slug=return_slug
)
if collection:
if return_slug:
print(collection.slug)
else:
print("\n🎉 Collection created successfully!")
print(f"Use this slug to add models: {collection.slug}")
else:
print("\n❌ Failed to create collection")
sys.exit(1)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,78 @@
#!/usr/bin/env python3
from huggingface_hub import HfApi
import argparse
# This script requires that the environment variable HF_TOKEN is set with your
# Hugging Face API token.
api = HfApi()
def load_template_and_substitute(template_path, **kwargs):
try:
with open(template_path, 'r', encoding='utf-8') as f:
template_content = f.read()
return template_content.format(**kwargs)
except FileNotFoundError:
print(f"Template file '{template_path}' not found!")
return None
except KeyError as e:
print(f"Missing template variable: {e}")
return None
parser = argparse.ArgumentParser(description='Create a new Hugging Face model repository')
parser.add_argument('--model-name', '-m', help='Name for the model', required=True)
parser.add_argument('--namespace', '-ns', help='Namespace to add the model to', required=True)
parser.add_argument('--org-base-model', '-b', help='Original Base model name', default="")
parser.add_argument('--no-card', action='store_true', help='Skip creating model card')
parser.add_argument('--private', '-p', action='store_true', help='Create private model')
parser.add_argument('--embedding', '-e', action='store_true', help='Use embedding model card template')
parser.add_argument('--dry-run', '-d', action='store_true', help='Print repository info and template without creating repository')
args = parser.parse_args()
repo_id = f"{args.namespace}/{args.model_name}-GGUF"
print("Repository ID: ", repo_id)
repo_url = None
if not args.dry_run:
repo_url = api.create_repo(
repo_id=repo_id,
repo_type="model",
private=args.private,
exist_ok=False
)
if not args.no_card:
if args.embedding:
template_path = "scripts/embedding/modelcard.template"
else:
template_path = "scripts/causal/modelcard.template"
print("Template path: ", template_path)
model_card_content = load_template_and_substitute(
template_path,
model_name=args.model_name,
namespace=args.namespace,
base_model=args.org_base_model,
)
if args.dry_run:
print("\nTemplate Content:\n")
print(model_card_content)
else:
if model_card_content:
api.upload_file(
path_or_fileobj=model_card_content.encode('utf-8'),
path_in_repo="README.md",
repo_id=repo_id
)
print("Model card created successfully.")
else:
print("Failed to create model card.")
if not args.dry_run and repo_url:
print(f"Repository created: {repo_url}")

View File

@@ -0,0 +1,58 @@
#!/usr/bin/env python3
from huggingface_hub import HfApi
import argparse
import os
def upload_gguf_file(local_file_path, repo_id, filename_in_repo=None):
"""
Upload a GGUF file to a Hugging Face model repository
Args:
local_file_path: Path to your local GGUF file
repo_id: Your repository ID (e.g., "username/model-name")
filename_in_repo: Optional custom name for the file in the repo
"""
if not os.path.exists(local_file_path):
print(f"❌ File not found: {local_file_path}")
return False
if filename_in_repo is None:
filename_in_repo = os.path.basename(local_file_path)
if filename_in_repo is None or filename_in_repo == "":
filename_in_repo = os.path.basename(local_file_path)
print(f"📤 Uploading {local_file_path} to {repo_id}/{filename_in_repo}")
api = HfApi()
try:
api.upload_file(
path_or_fileobj=local_file_path,
path_in_repo=filename_in_repo,
repo_id=repo_id,
repo_type="model",
commit_message=f"Upload {filename_in_repo}"
)
print("✅ Upload successful!")
print(f"🔗 File available at: https://huggingface.co/{repo_id}/blob/main/{filename_in_repo}")
return True
except Exception as e:
print(f"❌ Upload failed: {e}")
return False
# This script requires that the environment variable HF_TOKEN is set with your
# Hugging Face API token.
api = HfApi()
parser = argparse.ArgumentParser(description='Upload a GGUF model to a Huggingface model repository')
parser.add_argument('--gguf-model-path', '-m', help='The GGUF model file to upload', required=True)
parser.add_argument('--repo-id', '-r', help='The repository to upload to', required=True)
parser.add_argument('--name', '-o', help='The name in the model repository', required=False)
args = parser.parse_args()
upload_gguf_file(args.gguf_model_path, args.repo_id, args.name)

View File

@@ -0,0 +1,14 @@
#!/usr/bin/env bash
# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. CONVERTED_MODEL environment variable" >&2
exit 1
fi
../../gguf-py/gguf/scripts/gguf_dump.py $CONVERTED_MODEL

View File

@@ -0,0 +1,67 @@
#!/usr/bin/env python3
import argparse
import os
import json
from safetensors import safe_open
from collections import defaultdict
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
args = parser.parse_args()
model_path = os.environ.get('MODEL_PATH', args.model_path)
if model_path is None:
parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
# Check if there's an index file (multi-file model)
index_path = os.path.join(model_path, "model.safetensors.index.json")
single_file_path = os.path.join(model_path, "model.safetensors")
if os.path.exists(index_path):
# Multi-file model
print("Multi-file model detected")
with open(index_path, 'r') as f:
index_data = json.load(f)
# Get the weight map (tensor_name -> file_name)
weight_map = index_data.get("weight_map", {})
# Group tensors by file for efficient processing
file_tensors = defaultdict(list)
for tensor_name, file_name in weight_map.items():
file_tensors[file_name].append(tensor_name)
print("Tensors in model:")
# Process each shard file
for file_name, tensor_names in file_tensors.items():
file_path = os.path.join(model_path, file_name)
print(f"\n--- From {file_name} ---")
with safe_open(file_path, framework="pt") as f:
for tensor_name in sorted(tensor_names):
tensor = f.get_tensor(tensor_name)
print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")
elif os.path.exists(single_file_path):
# Single file model (original behavior)
print("Single-file model detected")
with safe_open(single_file_path, framework="pt") as f:
keys = f.keys()
print("Tensors in model:")
for key in sorted(keys):
tensor = f.get_tensor(key)
print(f"- {key} : shape = {tensor.shape}, dtype = {tensor.dtype}")
else:
print(f"Error: Neither 'model.safetensors.index.json' nor 'model.safetensors' found in {model_path}")
print("Available files:")
if os.path.exists(model_path):
for item in sorted(os.listdir(model_path)):
print(f" {item}")
else:
print(f" Directory {model_path} does not exist")
exit(1)

View File

@@ -0,0 +1,35 @@
#!/usr/bin/env bash
set -e
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. CONVERTED_MODEL environment variable" >&2
exit 1
fi
# Check if data/wikitext-2-raw directory exists
if [ ! -d "ppl/wikitext-2-raw" ]; then
echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2
mkdir -p ppl
pushd ppl
./../../../scripts/get-wikitext-2.sh
popd
fi
mkdir -p ppl
OUTPUTFILE="ppl/$(basename $CONVERTED_MODEL).kld"
echo "Model: $CONVERTED_MODEL"
cmake --build ../../build --target llama-perplexity -j8
../.././build/bin/llama-perplexity -m $CONVERTED_MODEL \
-f ppl/wikitext-2-raw/wiki.test.raw \
--kl-divergence-base $OUTPUTFILE
echo "Generated logits in $OUTPUTFILE"

View File

@@ -0,0 +1,27 @@
#!/usr/bin/env bash
set -e
QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}"
if [ -z "$QUANTIZED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. QUANTIZED_MODEL environment variable" >&2
exit 1
fi
# Check if data/wikitext-2-raw directory exists
if [ ! -d "ppl/wikitext-2-raw" ]; then
echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2
mkdir -p ppl
pushd ppl
./../../../scripts/get-wikitext-2.sh
popd
fi
cmake --build ../../build --target llama-perplexity -j8
../.././build/bin/llama-perplexity -m $QUANTIZED_MODEL -f ppl/wikitext-2-raw/wiki.test.raw

View File

@@ -0,0 +1,28 @@
#!/usr/bin/env bash
set -e
QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}"
LOGITS_FILE="${1:-"$LOGITS_FILE"}"
if [ -z "$QUANTIZED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. QUANTIZED_MODEL environment variable" >&2
exit 1
fi
if [ ! -f ${LOGITS_FILE} ]; then
echo "Error: logits file '${LOGITS_FILE} was not found"
echo "Did you run the perplexity-gen.sh script?"
exit 1
fi
echo "Model: $QUANTIZED_MODEL"
echo "Data file: $LOGITS_FILE"
cmake --build ../../build --target llama-perplexity -j8
../.././build/bin/llama-perplexity -m $QUANTIZED_MODEL \
--kl-divergence-base $LOGITS_FILE \
--kl-divergence

View File

@@ -0,0 +1,48 @@
#!/usr/bin/env bash
set -e
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
QUANTIZED_TYPE="${2:-"$QUANTIZED_TYPE"}"
TOKEN_EMBD_TYPE="${3:-"${TOKEN_EMBD_TYPE}"}"
OUTPUT_TYPE="${4:-"${OUTPUT_TYPE}"}"
QUANTIZED_MODEL=$CONVERTED_MODEL
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. CONVERTED_MODEL environment variable" >&2
exit 1
fi
if [ -z "$QUANTIZED_TYPE" ]; then
echo "Error: QUANTIZED_TYPE is required" >&2
exit 1
fi
echo $CONVERTED_MODEL
# Process the quantized model filename
if [[ "$QUANTIZED_MODEL" == *.gguf ]]; then
# Remove .gguf suffix, add quantized type, then add .gguf back
BASE_NAME="${QUANTIZED_MODEL%.gguf}"
QUANTIZED_MODEL="${BASE_NAME}-${QUANTIZED_TYPE}.gguf"
else
echo "Error: QUANTIZED_MODEL must end with .gguf extension" >&2
exit 1
fi
cmake --build ../../build --target llama-quantize -j8
echo $TOKEN_EMBD_TYPE
echo $OUTPUT_TYPE
CMD_ARGS=("../../build/bin/llama-quantize")
[[ -n "$TOKEN_EMBD_TYPE" ]] && CMD_ARGS+=("--token-embedding-type" "$TOKEN_EMBD_TYPE")
[[ -n "$OUTPUT_TYPE" ]] && CMD_ARGS+=("--output-tensor-type" "$OUTPUT_TYPE")
CMD_ARGS+=("$CONVERTED_MODEL" "$QUANTIZED_MODEL" "$QUANTIZED_TYPE")
"${CMD_ARGS[@]}"
echo "Quantized model saved to: $QUANTIZED_MODEL"

View File

@@ -0,0 +1,22 @@
#!/usr/bin/env bash
set -e
#
# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. CONVERTED_MODEL environment variable" >&2
exit 1
fi
echo $CONVERTED_MODEL
cmake --build ../../build --target llama-server
../../build/bin/llama-server -m $CONVERTED_MODEL \
--embedding \
--pooling none

View File

@@ -0,0 +1,242 @@
#!/usr/bin/env python3
import numpy as np
import argparse
import os
import importlib
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
from common import compare_tokens, exit_with_warning # type: ignore[import-not-found]
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
def cosine_similarity(a, b=None):
a = np.asarray(a)
if b is None:
b = a
else:
b = np.asarray(b)
if a.ndim == 1:
a = a.reshape(1, -1)
if b.ndim == 1:
b = b.reshape(1, -1)
a_norms = np.linalg.norm(a, axis=1, keepdims=True)
b_norms = np.linalg.norm(b, axis=1, keepdims=True)
a_norms = np.where(a_norms == 0, 1e-8, a_norms)
b_norms = np.where(b_norms == 0, 1e-8, b_norms)
a_normalized = a / a_norms
b_normalized = b / b_norms
# Compute cosine similarity
return np.dot(a_normalized, b_normalized.T)
def load_embeddings_from_file(filename, n_tokens, n_embd):
embeddings = np.fromfile(filename, dtype=np.float32)
# Check if this is pooled (single embedding) or per-token embeddings
if len(embeddings) == n_embd:
return embeddings.reshape(1, n_embd)
else:
return embeddings.reshape(n_tokens, n_embd)
def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
np.set_printoptions(suppress=True, precision=6)
print("pytorch embeddings:");
print(python_emb)
print("llama.cpp embeddings:");
print(cpp_emb)
print(f"\n=== Prompt: '{prompt}' ===")
print(f"Tokens: {tokens}")
print(f"Embeddings shape: Python {python_emb.shape}, llama.cpp {cpp_emb.shape}")
n_tokens = len(tokens)
is_pooled = python_emb.shape[0] == 1
if is_pooled:
print(f"\n[Pooled Embeddings Mode - comparing single sentence embeddings]")
# 1. Direct embedding comparison for pooled embeddings
print(f"\n1. Raw Embedding Magnitude Comparison:")
py_mag = np.linalg.norm(python_emb[0])
cpp_mag = np.linalg.norm(cpp_emb[0])
ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
print(f" Pooled embedding: Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")
# 2. Cross-model similarity for pooled embeddings
print(f"\n2. Cross-Model Pooled Embedding Similarity:")
sim = cosine_similarity([python_emb[0]], [cpp_emb[0]])[0][0]
print(f" Cosine similarity: {sim:.6f}")
return {
'cross_model_similarities': [sim],
'similarity_matrix_diff': np.array([[0.0]]),
'max_diff': 0.0,
'mean_diff': 0.0,
'rms_diff': 0.0
}
else:
# Original per-token comparison logic
# 1. Direct embedding comparison
print(f"\n1. Raw Embedding Magnitude Comparison:")
# Check if the distance of each token embedding from the origin and compare
# if the vectors are on the same "sphere". This does not tell us about
# direction (meaning of the token embedding), just magnitude.
for i in range(n_tokens):
py_mag = np.linalg.norm(python_emb[i]) # calculate standard euclidean norm for Python embeddings
cpp_mag = np.linalg.norm(cpp_emb[i]) # calculate standard euclidean norm for llama.cpp embeddings
ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
print(f" Token {i} ({tokens[i]}): Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")
# 2. Cosine similarity between tokens within each model
# Here we check the direction of token embeddings to see if the have the
# same meaning (similarity). This is done by calculating cosine similarity
# of a pair of token embeddings within each model.
print(f"\n2. Within-Model Token Similarities:")
print(" Python model:")
for i in range(n_tokens):
for j in range(i+1, n_tokens):
sim = cosine_similarity([python_emb[i]], [python_emb[j]])[0][0]
print(f" {tokens[i]}{tokens[j]}: {sim:.4f}")
print(" llama.cpp model:")
for i in range(n_tokens):
for j in range(i+1, n_tokens):
sim = cosine_similarity([cpp_emb[i]], [cpp_emb[j]])[0][0]
print(f" {tokens[i]}{tokens[j]}: {sim:.4f}")
# 3. Cross-model similarity (same token position)
print(f"\n3. Cross-Model Same-Token Similarities:")
for i in range(n_tokens):
sim = cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0]
print(f" Token {i} ({tokens[i]}): {sim:.4f}")
# 4. Similarity matrix comparison
print(f"\n4. Similarity Matrix Differences:")
py_sim_matrix = cosine_similarity(python_emb)
cpp_sim_matrix = cosine_similarity(cpp_emb)
diff_matrix = np.abs(py_sim_matrix - cpp_sim_matrix)
print(f" Max difference: {np.max(diff_matrix):.4f}")
print(f" Mean difference: {np.mean(diff_matrix):.4f}")
print(f" RMS difference: {np.sqrt(np.mean(diff_matrix**2)):.4f}")
return {
'cross_model_similarities': [cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0] for i in range(n_tokens)],
'similarity_matrix_diff': diff_matrix,
'max_diff': np.max(diff_matrix),
'mean_diff': np.mean(diff_matrix),
'rms_diff': np.sqrt(np.mean(diff_matrix**2))
}
def read_prompt_from_file(file_path):
try:
with open(file_path, 'r', encoding='utf-8') as f:
return f.read().strip()
except FileNotFoundError:
print(f"Error: Prompts file '{file_path}' not found")
exit(1)
except Exception as e:
print(f"Error reading prompts file: {e}")
exit(1)
def main():
parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings')
parser.add_argument('--model-path', '-m', required=True, help='Path to the original Python model')
parser.add_argument('--python-embeddings', '-pe', help='Path to pytorch embeddings "logits" binary file')
parser.add_argument('--cpp-embeddings', '-ce', help='Path to llama.cpp embeddings "logits" binary file')
parser.add_argument('--causal', '-c', default=False, help='if the model is causal (default: false)', action='store_true')
parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt')
parser.add_argument('--prompts-file', '-pf', help='Path to file containing prompts')
args = parser.parse_args()
if args.prompts_file:
prompt = read_prompt_from_file(args.prompts_file)
else:
prompt = args.prompt
python_emb_path = Path(args.python_embeddings)
cpp_emb_path = Path(args.cpp_embeddings)
# Extract base names (e.g., "pytorch-model-name-embeddings.bin" -> "pytorch-model-name")
python_model_name = python_emb_path.stem.replace("-embeddings", "")
cpp_model_name = cpp_emb_path.stem.replace("-embeddings", "")
print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
print("=" * 70)
# First verify tokens match before comparing embeddings
print("\n🔍 Token Comparison Check")
print("=" * 70)
data_dir = python_emb_path.parent
if not compare_tokens(python_model_name, cpp_model_name, type_suffix="-embeddings", output_dir=str(data_dir)):
exit_with_warning("\n❌ Token mismatch detected", args.model_path)
print()
# Single prompt detailed comparison
print(f"\nTesting with prompt: '{prompt}'")
# Load the python model to get configuration information and also to load the tokenizer.
print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
config = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True)
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
if args.causal:
class_name = f"{unreleased_model_name}ForCausalLM"
else:
class_name = f"{unreleased_model_name}Model"
print(f"Model class: {class_name}")
print(f"Importing unreleased model module: {unreleased_module_path}")
try:
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
model = model_class.from_pretrained(args.model_path)
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
exit(1)
else:
if args.causal:
model = AutoModelForCausalLM.from_pretrained(args.model_path, trust_remote_code=True)
else:
model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
encoded = tokenizer(prompt, return_tensors="pt")
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
n_tokens = len(tokens)
print(f"n_tokens: {n_tokens}");
print(f"hidden_size: {model.config.hidden_size}")
# Load binary embeddings from data directory.
llamacpp_embeddings = load_embeddings_from_file(args.cpp_embeddings, n_tokens, model.config.hidden_size)
python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size)
# Run comparison
results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, prompt)
# Summary
print(f"\n=== SUMMARY ===")
avg_cross_sim = np.mean(results['cross_model_similarities'])
print(f"Average cross-model similarity: {avg_cross_sim:.4f}")
print(f"Similarity matrix RMS difference: {results['rms_diff']:.4f}")
# Quality assessment
if avg_cross_sim > 0.95:
print("✅ EXCELLENT: Models are highly similar")
elif avg_cross_sim > 0.90:
print("✅ VERY GOOD: Models are very similar")
elif avg_cross_sim > 0.80:
print("⚠️ GOOD: Models are reasonably similar")
elif avg_cross_sim > 0.70:
print("⚠️ FAIR: Models have some differences")
else:
exit_with_warning("❌ POOR: Models are significantly different", args.model_path)
if __name__ == "__main__":
main()