Sync from upstream llama.cpp repository

commit f4ae4cc7da
parent 3bc369a6f7
2026-01-16 10:43:34 +08:00
2053 changed files with 956010 additions and 1 deletion

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env bash
set -e
MODEL_PATH="${1:-"$MODEL_PATH"}"
MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"
CONVERTED_MODEL_PATH="${1:-"$CONVERTED_MODEL"}"
CONVERTED_MODEL_NAME="${2:-$(basename "$CONVERTED_MODEL_PATH" ".gguf")}"
if [ -t 0 ]; then
CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin"
else
# Process piped JSON data and convert to binary (matching logits.cpp format)
TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.bin)
python3 -c "
import json
import sys
import struct
data = json.load(sys.stdin)
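# Assumed input shape (an assumption, not validated here): a JSON array of objects,
# each with an 'embedding' field holding one list of floats per token, e.g.
# [{'embedding': [[0.1, 0.2, ...], [0.3, 0.4, ...]]}, ...]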
# Flatten all embeddings completely
flattened = []
for item in data:
    embedding = item['embedding']
    for token_embedding in embedding:
        flattened.extend(token_embedding)
print(f'Total embedding values: {len(flattened)}', file=sys.stderr)
# Write as binary floats - matches the logits.cpp fwrite format
with open('$TEMP_FILE', 'wb') as f:
    for value in flattened:
        f.write(struct.pack('f', value))
"
CPP_EMBEDDINGS="$TEMP_FILE"
trap "rm -f $TEMP_FILE" EXIT
fi
python scripts/utils/semantic_check.py --model-path "$MODEL_PATH" \
--python-embeddings "data/pytorch-${MODEL_NAME}-embeddings.bin" \
--cpp-embeddings "$CPP_EMBEDDINGS" \
--prompt "Hello world today" \
--causal

View File

@@ -0,0 +1,87 @@
#!/usr/bin/env python3
import sys
import numpy as np
from pathlib import Path
import os
# Add utils directory to path for direct script execution
sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
from common import get_model_name_from_env_path, compare_tokens, exit_with_warning # type: ignore[import-not-found]
def quick_logits_check(pytorch_file, llamacpp_file):
"""Lightweight sanity check before NMSE"""
try:
pytorch_logits = np.fromfile(pytorch_file, dtype=np.float32)
llamacpp_logits = np.fromfile(llamacpp_file, dtype=np.float32)
except Exception as e:
print(f"❌ NOK: Failed to load files - {e}")
return False
# Check shapes match
if pytorch_logits.shape != llamacpp_logits.shape:
print(f"❌ NOK: Shape mismatch - PyTorch: {pytorch_logits.shape}, llama.cpp: {llamacpp_logits.shape}")
return False
# Calculate key metrics
diff = pytorch_logits - llamacpp_logits
abs_diff = np.abs(diff)
max_diff = np.max(abs_diff)
# Get top 10 predictions from both models
pytorch_top10 = np.argsort(pytorch_logits)[-10:][::-1]
llamacpp_top10 = np.argsort(llamacpp_logits)[-10:][::-1]
print(f"Top 10 PyTorch logits: {pytorch_logits[pytorch_top10]}")
print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}")
print(f"Max absolute difference: {max_diff:.4f}")
return True
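
# The NMSE mentioned above is computed in a separate, later step. As a reference,
# a minimal sketch of that metric (an assumed definition, not this repository's code):
def _nmse_sketch(reference: np.ndarray, test: np.ndarray) -> float:
    return float(np.mean((reference - test) ** 2) / np.mean(reference ** 2))
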
def main():
    model_path = os.environ.get('MODEL_PATH')
    model_name = get_model_name_from_env_path('MODEL_PATH')
    data_dir = Path("data")
    pytorch_file = data_dir / f"pytorch-{model_name}.bin"

    llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL')
    print(f"Using converted model: {llamacpp_model_name}")
    llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin"

    if not pytorch_file.exists():
        print(f"Error: PyTorch logits file not found: {pytorch_file}")
        print("Please run scripts/run-org-model.sh first to generate this file.")
        sys.exit(1)

    if not llamacpp_file.exists():
        print(f"Error: llama.cpp logits file not found: {llamacpp_file}")
        print("Please run scripts/run-converted-model.sh first to generate this file.")
        sys.exit(1)

    print("Checked all required files were found. Proceeding...\n")

    # Verify tokens as they are a prerequisite for logits comparison.
    print("🔍 Token Comparison Check")
    print("=" * 40)
    if not compare_tokens(f"pytorch-{model_name}", f"llamacpp-{llamacpp_model_name}"):
        exit_with_warning("\n❌ Token mismatch detected", model_path)
    print()

    print(f"🔍 GGML Model Validation for model {model_name}")
    print("=" * 40)
    print(f"PyTorch logits : {pytorch_file}")
    print(f"llama.cpp logits: {llamacpp_file}")
    print()

    success = quick_logits_check(pytorch_file, llamacpp_file)

    # Exit with appropriate code
    if success:
        print("✅ OK: Lightweight model check successful!")
        print(" Ok to proceed with NMSE check...")
        sys.exit(0)
    else:
        exit_with_warning("❌ NOK: Lightweight logits check failed - generation will differ", model_path)
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env bash
set -e
# Parse command line arguments
MMPROJ=""
while [[ $# -gt 0 ]]; do
case $1 in
--mmproj)
MMPROJ="--mmproj"
shift
;;
*)
shift
;;
esac
done
MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
TYPE="${OUTTYPE:-f16}"
METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
echo "Model path: ${MODEL_PATH}"
echo "Model name: ${MODEL_NAME}"
echo "Data type: ${TYPE}"
echo "Converted model path:: ${CONVERTED_MODEL}"
echo "Metadata override: ${METADATA_OVERRIDE}"
CMD_ARGS=("python" "../../convert_hf_to_gguf.py" "--verbose")
CMD_ARGS+=("${MODEL_PATH}")
CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}")
CMD_ARGS+=("--outtype" "${TYPE}")
[[ -n "$METADATA_OVERRIDE" ]] && CMD_ARGS+=("--metadata" "${METADATA_OVERRIDE}")
[[ -n "$MMPROJ" ]] && CMD_ARGS+=("${MMPROJ}")
"${CMD_ARGS[@]}"
echo ""
echo "The environment variable CONVERTED_MODEL can be set to this path using:"
echo "export CONVERTED_MODEL=$(realpath ${CONVERTED_MODEL})"
if [[ -n "$MMPROJ" ]]; then
mmproj_file="${OUTPUT_DIR}/mmproj-$(basename "${CONVERTED_MODEL}")"
echo "The mmproj model was created in $(realpath "$mmproj_file")"
fi
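
# Example invocation (illustrative only; the script name, paths, and values below are
# assumptions, not part of this repository):
#   MODEL_PATH=~/models/SomeModel OUTTYPE=q8_0 ./convert-model.sh
#   MODEL_PATH=~/models/SomeVisionModel ./convert-model.sh --mmproj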

View File

@@ -0,0 +1,13 @@
---
base_model:
- {base_model}
---
# {model_name} GGUF
Recommended way to run this model:
```sh
llama-server -hf {namespace}/{model_name}-GGUF
```
Then, access http://localhost:8080

View File

@@ -0,0 +1,114 @@
#!/usr/bin/env python3
import argparse
import os
import importlib
import torch
import numpy as np
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from pathlib import Path
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
args = parser.parse_args()
model_path = os.environ.get('MODEL_PATH', args.model_path)
if model_path is None:
parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
config = AutoConfig.from_pretrained(model_path)
print("Model type: ", config.model_type)
print("Vocab size: ", config.vocab_size)
print("Hidden size: ", config.hidden_size)
print("Number of layers: ", config.num_hidden_layers)
print("BOS token id: ", config.bos_token_id)
print("EOS token id: ", config.eos_token_id)
print("Loading model and tokenizer using AutoTokenizer:", model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
if unreleased_model_name:
    model_name_lower = unreleased_model_name.lower()
    unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
    class_name = f"{unreleased_model_name}ForCausalLM"
    print(f"Importing unreleased model module: {unreleased_module_path}")
    try:
        model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
        model = model_class.from_pretrained(model_path)
    except (ImportError, AttributeError) as e:
        print(f"Failed to import or load model: {e}")
        print("Falling back to AutoModelForCausalLM")
        model = AutoModelForCausalLM.from_pretrained(model_path)
else:
    model = AutoModelForCausalLM.from_pretrained(model_path)
print(f"Model class: {type(model)}")
#print(f"Model file: {type(model).__module__}")
model_name = os.path.basename(model_path)
print(f"Model name: {model_name}")
prompt = "Hello world today"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
print(f"Input tokens: {input_ids}")
print(f"Input text: {repr(prompt)}")
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
with torch.no_grad():
    outputs = model(input_ids, output_hidden_states=True)
# Extract hidden states from the last layer
# outputs.hidden_states is a tuple of (num_layers + 1) tensors
# Index -1 gets the last layer, shape: [batch_size, seq_len, hidden_size]
last_hidden_states = outputs.hidden_states[-1]
# Get embeddings for all tokens
token_embeddings = last_hidden_states[0].float().cpu().numpy() # Remove batch dimension
print(f"Hidden states shape: {last_hidden_states.shape}")
print(f"Token embeddings shape: {token_embeddings.shape}")
print(f"Hidden dimension: {token_embeddings.shape[-1]}")
print(f"Number of tokens: {token_embeddings.shape[0]}")
# Save raw token embeddings
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
# Save all token embeddings as binary
print(token_embeddings)
token_embeddings.astype(np.float32).tofile(bin_filename)
# Save as text for inspection
with open(txt_filename, "w") as f:
    for i, embedding in enumerate(token_embeddings):
        for j, val in enumerate(embedding):
            f.write(f"{i} {j} {val:.6f}\n")
# Print embeddings per token in the requested format
print("\nToken embeddings:")
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
for i, embedding in enumerate(token_embeddings):
    # Format: show first few values, ..., then last few values
    if len(embedding) > 10:
        # Show first 3 and last 3 values with ... in between
        first_vals = " ".join(f"{val:8.6f}" for val in embedding[:3])
        last_vals = " ".join(f"{val:8.6f}" for val in embedding[-3:])
        print(f"embedding {i}: {first_vals} ... {last_vals}")
    else:
        # If the embedding is short, show all values
        vals = " ".join(f"{val:8.6f}" for val in embedding)
        print(f"embedding {i}: {vals}")
# Also show token info for reference
print(f"\nToken reference:")
for i, token in enumerate(tokens):
print(f" Token {i}: {repr(token)}")
print(f"Saved bin logits to: {bin_filename}")
print(f"Saved txt logist to: {txt_filename}")

View File

@@ -0,0 +1,18 @@
#!/usr/bin/env bash
set -e
# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. CONVERTED_MODEL environment variable" >&2
exit 1
fi
cmake --build ../../build --target llama-debug -j8
../../build/bin/llama-debug -m "$CONVERTED_MODEL" --embedding -p "Hello world today" --save-logits

View File

@@ -0,0 +1,26 @@
#!/usr/bin/env bash
set -e
# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
MODEL_TESTING_PROMPT="${2:-"$MODEL_TESTING_PROMPT"}"
if [ -z "$MODEL_TESTING_PROMPT"]; then
MODEL_TESTING_PROMPT="Hello, my name is"
fi
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. CONVERTED_MODEL environment variable" >&2
exit 1
fi
echo "Converted model: $CONVERTED_MODEL"
echo "Prompt: $MODEL_TESTING_PROMPT"
cmake --build ../../build --target llama-debug -j8
../../build/bin/llama-debug -m "$CONVERTED_MODEL" -p "$MODEL_TESTING_PROMPT" --save-logits
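
# Typical sequence (a sketch inferred from the check script above; exact script names
# and paths are assumptions):
#   1. run the original model script to dump reference logits into data/
#   2. convert the model to GGUF
#   3. run this script against the converted model
#   4. run the comparison/check script on the two dumps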

View File

@@ -0,0 +1,168 @@
#!/usr/bin/env python3
import argparse
import os
import sys
import importlib
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, AutoConfig
# Add parent directory to path for imports
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from utils.common import debug_hook, save_output_data
def parse_arguments():
    parser = argparse.ArgumentParser(description="Process model with specified path")
    parser.add_argument("--model-path", "-m", help="Path to the model")
    parser.add_argument("--prompt-file", "-f", help="Optional prompt file", required=False)
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose debug output")
    parser.add_argument("--device", "-d", help="Device to use (cpu, cuda, mps, auto)", default="auto")
    return parser.parse_args()
def load_model_and_tokenizer(model_path, device="auto"):
print("Loading model and tokenizer using AutoTokenizer:", model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
multimodal = False
full_config = config
# Determine device_map based on device argument
if device == "cpu":
device_map = {"": "cpu"}
print("Forcing CPU usage")
elif device == "auto":
device_map = "auto"
else:
device_map = {"": device}
print("Model type: ", config.model_type)
if "vocab_size" not in config and "text_config" in config:
config = config.text_config
multimodal = True
print("Vocab size: ", config.vocab_size)
print("Hidden size: ", config.hidden_size)
print("Number of layers: ", config.num_hidden_layers)
print("BOS token id: ", config.bos_token_id)
print("EOS token id: ", config.eos_token_id)
unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = (
f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
)
class_name = f"{unreleased_model_name}ForCausalLM"
print(f"Importing unreleased model module: {unreleased_module_path}")
try:
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
model = model_class.from_pretrained(
model_path,
device_map=device_map,
offload_folder="offload",
trust_remote_code=True,
config=config
)
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
exit(1)
else:
if multimodal:
model = AutoModelForImageTextToText.from_pretrained(
model_path,
device_map=device_map,
offload_folder="offload",
trust_remote_code=True,
config=full_config
)
else:
model = AutoModelForCausalLM.from_pretrained(
model_path,
device_map=device_map,
offload_folder="offload",
trust_remote_code=True,
config=config
)
print(f"Model class: {model.__class__.__name__}")
return model, tokenizer, config
def enable_torch_debugging(model):
    for name, module in model.named_modules():
        if len(list(module.children())) == 0:  # only leaf modules
            module.register_forward_hook(debug_hook(name))
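
# Illustrative only: the real debug_hook is imported from utils/common.py and may differ.
# A forward hook of this shape prints per-module output statistics when --verbose is set.
def _example_debug_hook(name):
    def hook(module, args, output):
        out = output[0] if isinstance(output, tuple) else output
        if torch.is_tensor(out):
            print(f"{name}: shape={tuple(out.shape)} mean={out.float().mean().item():.6f}")
    return hook
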
def get_prompt(args):
    if args.prompt_file:
        with open(args.prompt_file, encoding='utf-8') as f:
            return f.read()
    elif os.getenv("MODEL_TESTING_PROMPT"):
        return os.getenv("MODEL_TESTING_PROMPT")
    else:
        return "Hello, my name is"
def main():
    args = parse_arguments()

    model_path = os.environ.get("MODEL_PATH", args.model_path)
    if model_path is None:
        print("Error: Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
        sys.exit(1)

    model, tokenizer, config = load_model_and_tokenizer(model_path, args.device)
    if args.verbose:
        enable_torch_debugging(model)

    model_name = os.path.basename(model_path)

    # Use the device of the first model parameter (tensor) to determine
    # which device the model was loaded on.
    device = next(model.parameters()).device

    prompt = get_prompt(args)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    token_ids = input_ids[0].cpu().tolist()

    print(f"Input tokens: {input_ids}")
    print(f"Input text: {repr(prompt)}")
    print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")

    batch_size = 512
    with torch.no_grad():
        past = None
        outputs = None
        # Process the prompt in chunks of batch_size tokens, carrying the KV cache
        # (past_key_values) between chunks so long prompts need not fit in one forward pass.
        for i in range(0, input_ids.size(1), batch_size):
            print(f"Processing chunk with tokens {i} to {i + batch_size}")
            chunk = input_ids[:, i:i + batch_size]
            outputs = model(chunk.to(model.device), past_key_values=past, use_cache=True)
            past = outputs.past_key_values

        logits = outputs.logits # type: ignore

        # Extract logits for the last token (next token prediction)
        last_logits = logits[0, -1, :].float().cpu().numpy()

    print(f"Logits shape: {logits.shape}")
    print(f"Last token logits shape: {last_logits.shape}")
    print(f"Vocab size: {len(last_logits)}")

    # Print some sample logits for quick verification
    print(f"First 10 logits: {last_logits[:10]}")
    print(f"Last 10 logits: {last_logits[-10:]}")

    # Show top 5 predicted tokens
    top_indices = np.argsort(last_logits)[-5:][::-1]
    print("Top 5 predictions:")
    for idx in top_indices:
        token = tokenizer.decode([idx])
        print(f" Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")

    save_output_data(last_logits, token_ids, prompt, model_name)
if __name__ == "__main__":
    main()