sync from b7516

This commit is contained in:
2026-01-16 11:16:14 +08:00
parent f4ae4cc7da
commit 6ee41dd9e3
380 changed files with 18435 additions and 38806 deletions

View File

@@ -4,10 +4,8 @@ import numpy as np
import argparse
import os
import importlib
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
from common import compare_tokens, exit_with_warning # type: ignore[import-not-found]
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
@@ -159,31 +157,16 @@ def main():
else:
prompt = args.prompt
python_emb_path = Path(args.python_embeddings)
cpp_emb_path = Path(args.cpp_embeddings)
# Extract base names (e.g., "pytorch-model-name-embeddings.bin" -> "pytorch-model-name")
python_model_name = python_emb_path.stem.replace("-embeddings", "")
cpp_model_name = cpp_emb_path.stem.replace("-embeddings", "")
print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
print("=" * 70)
# First verify tokens match before comparing embeddings
print("\n🔍 Token Comparison Check")
print("=" * 70)
data_dir = python_emb_path.parent
if not compare_tokens(python_model_name, cpp_model_name, type_suffix="-embeddings", output_dir=str(data_dir)):
exit_with_warning("\n❌ Token mismatch detected", args.model_path)
print()
# Single prompt detailed comparison
print(f"\nTesting with prompt: '{prompt}'")
# Load the python model to get configuration information and also to load the tokenizer.
print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
config = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(args.model_path)
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
@@ -203,9 +186,9 @@ def main():
exit(1)
else:
if args.causal:
model = AutoModelForCausalLM.from_pretrained(args.model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(args.model_path)
else:
model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(args.model_path)
encoded = tokenizer(prompt, return_tensors="pt")
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
@@ -236,7 +219,7 @@ def main():
elif avg_cross_sim > 0.70:
print("⚠️ FAIR: Models have some differences")
else:
exit_with_warning("❌ POOR: Models are significantly different", args.model_path)
print("❌ POOR: Models are significantly different")
if __name__ == "__main__":
main()