Sync from upstream llama.cpp repository
This commit is contained in:
3
examples/model-conversion/.gitignore
vendored
Normal file
3
examples/model-conversion/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
.model_name
|
||||
data
|
||||
ppl
|
||||
229
examples/model-conversion/Makefile
Normal file
229
examples/model-conversion/Makefile
Normal file
@@ -0,0 +1,229 @@
|
||||
MAKEFLAGS += --no-print-directory
|
||||
|
||||
define validate_model_path
|
||||
@if [ -z "$(MODEL_PATH)" ]; then \
|
||||
echo "Error: MODEL_PATH must be provided either as:"; \
|
||||
echo " 1. Environment variable: export MODEL_PATH=/path/to/model"; \
|
||||
echo " 2. Command line argument: make $(1) MODEL_PATH=/path/to/model"; \
|
||||
exit 1; \
|
||||
fi
|
||||
endef
|
||||
|
||||
define validate_embedding_model_path
|
||||
@if [ -z "$(EMBEDDING_MODEL_PATH)" ]; then \
|
||||
echo "Error: EMBEDDING_MODEL_PATH must be provided either as:"; \
|
||||
echo " 1. Environment variable: export EMBEDDING_MODEL_PATH=/path/to/model"; \
|
||||
echo " 2. Command line argument: make $(1) EMBEDDING_MODEL_PATH=/path/to/model"; \
|
||||
exit 1; \
|
||||
fi
|
||||
endef
|
||||
|
||||
define quantize_model
|
||||
@CONVERTED_MODEL="$(1)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" \
|
||||
TOKEN_EMBD_TYPE="$(TOKEN_EMBD_TYPE)" OUTPUT_TYPE="$(OUTPUT_TYPE)" \
|
||||
./scripts/utils/quantize.sh "$(1)" "$(QUANTIZED_TYPE)" "$(TOKEN_EMBD_TYPE)" "$(OUTPUT_TYPE)"
|
||||
@echo "Export the quantized model path to $(2) variable in your environment"
|
||||
endef
|
||||
|
||||
DEVICE ?= auto
|
||||
|
||||
###
|
||||
### Casual Model targets/recipes
|
||||
###
|
||||
causal-convert-model-bf16: OUTTYPE=bf16
|
||||
causal-convert-model-bf16: causal-convert-model
|
||||
|
||||
causal-convert-model:
|
||||
$(call validate_model_path,causal-convert-model)
|
||||
@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \
|
||||
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
|
||||
./scripts/causal/convert-model.sh
|
||||
|
||||
causal-convert-mm-model-bf16: OUTTYPE=bf16
|
||||
causal-convert-mm-model-bf16: MM_OUTTYPE=f16
|
||||
causal-convert-mm-model-bf16: causal-convert-mm-model
|
||||
|
||||
causal-convert-mm-model:
|
||||
$(call validate_model_path,causal-convert-mm-model)
|
||||
@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \
|
||||
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
|
||||
./scripts/causal/convert-model.sh
|
||||
|
||||
@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(MM_OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \
|
||||
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
|
||||
./scripts/causal/convert-model.sh --mmproj
|
||||
|
||||
causal-run-original-model:
|
||||
$(call validate_model_path,causal-run-original-model)
|
||||
@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/run-org-model.py --device "$(DEVICE)"
|
||||
|
||||
causal-run-converted-model:
|
||||
@CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/causal/run-converted-model.sh
|
||||
|
||||
causal-verify-logits: causal-run-original-model causal-run-converted-model
|
||||
@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/compare-logits.py
|
||||
@MODEL_PATH="$(MODEL_PATH)" ./scripts/utils/check-nmse.py -m ${MODEL_PATH}
|
||||
|
||||
causal-run-original-embeddings:
|
||||
@./scripts/causal/run-casual-gen-embeddings-org.py
|
||||
|
||||
causal-run-converted-embeddings:
|
||||
@./scripts/causal/run-converted-model-embeddings-logits.sh
|
||||
|
||||
causal-verify-embeddings: causal-run-original-embeddings causal-run-converted-embeddings
|
||||
@./scripts/causal/compare-embeddings-logits.sh
|
||||
|
||||
causal-inspect-original-model:
|
||||
@./scripts/utils/inspect-org-model.py
|
||||
|
||||
causal-inspect-converted-model:
|
||||
@./scripts/utils/inspect-converted-model.sh
|
||||
|
||||
causal-start-embedding-server:
|
||||
@./scripts/utils/run-embedding-server.sh ${CONVERTED_MODEL}
|
||||
|
||||
causal-curl-embedding-endpoint: causal-run-original-embeddings
|
||||
@./scripts/utils/curl-embedding-server.sh | ./scripts/causal/compare-embeddings-logits.sh
|
||||
|
||||
causal-quantize-Q8_0: QUANTIZED_TYPE = Q8_0
|
||||
causal-quantize-Q8_0: causal-quantize-model
|
||||
|
||||
causal-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
|
||||
causal-quantize-Q4_0: causal-quantize-model
|
||||
|
||||
# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
|
||||
# token embedding and output types to Q8_0 instead of the default Q6_K.
|
||||
causal-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
|
||||
causal-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
|
||||
causal-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
|
||||
causal-quantize-qat-Q4_0: causal-quantize-model
|
||||
|
||||
causal-quantize-model:
|
||||
$(call quantize_model,$(CONVERTED_MODEL),QUANTIZED_MODEL)
|
||||
|
||||
causal-run-quantized-model:
|
||||
@QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/causal/run-converted-model.sh ${QUANTIZED_MODEL}
|
||||
|
||||
|
||||
###
|
||||
### Embedding Model targets/recipes
|
||||
###
|
||||
|
||||
embedding-convert-model-bf16: OUTTYPE=bf16
|
||||
embedding-convert-model-bf16: embedding-convert-model
|
||||
|
||||
embedding-convert-model:
|
||||
$(call validate_embedding_model_path,embedding-convert-model)
|
||||
@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
|
||||
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
|
||||
./scripts/embedding/convert-model.sh
|
||||
|
||||
embedding-convert-model-st:
|
||||
$(call validate_embedding_model_path,embedding-convert-model-st)
|
||||
@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
|
||||
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
|
||||
./scripts/embedding/convert-model.sh -st
|
||||
|
||||
embedding-run-original-model:
|
||||
$(call validate_embedding_model_path,embedding-run-original-model)
|
||||
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
|
||||
USE_SENTENCE_TRANSFORMERS="$(USE_SENTENCE_TRANSFORMERS)" \
|
||||
./scripts/embedding/run-original-model.py \
|
||||
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)") \
|
||||
$(if $(USE_SENTENCE_TRANSFORMERS),--use-sentence-transformers)
|
||||
|
||||
embedding-run-original-model-st: USE_SENTENCE_TRANSFORMERS=1
|
||||
embedding-run-original-model-st: embedding-run-original-model
|
||||
|
||||
embedding-run-converted-model:
|
||||
@./scripts/embedding/run-converted-model.sh $(CONVERTED_EMBEDDING_MODEL) \
|
||||
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)") \
|
||||
$(if $(EMBD_NORMALIZE),--embd-normalize "$(EMBD_NORMALIZE)")
|
||||
|
||||
embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
|
||||
@./scripts/embedding/compare-embeddings-logits.sh \
|
||||
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
|
||||
|
||||
embedding-verify-logits-st: embedding-run-original-model-st embedding-run-converted-model
|
||||
@./scripts/embedding/compare-embeddings-logits.sh \
|
||||
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
|
||||
|
||||
embedding-inspect-original-model:
|
||||
$(call validate_embedding_model_path,embedding-inspect-original-model)
|
||||
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/utils/inspect-org-model.py -m ${EMBEDDING_MODEL_PATH}
|
||||
|
||||
embedding-inspect-converted-model:
|
||||
@CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/utils/inspect-converted-model.sh ${CONVERTED_EMBEDDING_MODEL}
|
||||
|
||||
embedding-start-embedding-server:
|
||||
@./scripts/utils/run-embedding-server.sh ${CONVERTED_EMBEDDING_MODEL}
|
||||
|
||||
embedding-curl-embedding-endpoint:
|
||||
@./scripts/utils/curl-embedding-server.sh | ./scripts/embedding/compare-embeddings-logits.sh
|
||||
|
||||
embedding-quantize-Q8_0: QUANTIZED_TYPE = Q8_0
|
||||
embedding-quantize-Q8_0: embedding-quantize-model
|
||||
|
||||
embedding-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
|
||||
embedding-quantize-Q4_0: embedding-quantize-model
|
||||
|
||||
# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
|
||||
# token embedding and output types to Q8_0 instead of the default Q6_K.
|
||||
embedding-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
|
||||
embedding-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
|
||||
embedding-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
|
||||
embedding-quantize-qat-Q4_0: embedding-quantize-model
|
||||
|
||||
embedding-quantize-model:
|
||||
$(call quantize_model,$(CONVERTED_EMBEDDING_MODEL),QUANTIZED_EMBEDDING_MODEL)
|
||||
|
||||
embedding-run-quantized-model:
|
||||
@./scripts/embedding/run-converted-model.sh $(QUANTIZED_EMBEDDING_MODEL) \
|
||||
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
|
||||
|
||||
###
|
||||
### Perplexity targets/recipes
|
||||
###
|
||||
perplexity-data-gen:
|
||||
CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/utils/perplexity-gen.sh
|
||||
|
||||
perplexity-run-full:
|
||||
QUANTIZED_MODEL="$(QUANTIZED_MODEL)" LOOGITS_FILE="$(LOGITS_FILE)" \
|
||||
./scripts/utils/perplexity-run.sh
|
||||
|
||||
perplexity-run:
|
||||
QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/utils/perplexity-run-simple.sh
|
||||
|
||||
###
|
||||
### HuggingFace targets/recipes
|
||||
###
|
||||
|
||||
hf-create-model:
|
||||
@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}"
|
||||
|
||||
hf-create-model-dry-run:
|
||||
@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -d
|
||||
|
||||
hf-create-model-embedding:
|
||||
@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -e
|
||||
|
||||
hf-create-model-embedding-dry-run:
|
||||
@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -e -d
|
||||
|
||||
hf-create-model-private:
|
||||
@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -p
|
||||
|
||||
hf-upload-gguf-to-model:
|
||||
@./scripts/utils/hf-upload-gguf-model.py -m "${MODEL_PATH}" -r "${REPO_ID}" -o "${NAME_IN_REPO}"
|
||||
|
||||
hf-create-collection:
|
||||
@./scripts/utils/hf-create-collection.py -n "${NAME}" -d "${DESCRIPTION}" -ns "${NAMESPACE}"
|
||||
|
||||
hf-add-model-to-collection:
|
||||
@./scripts/utils/hf-add-model-to-collection.py -c "${COLLECTION}" -m "${MODEL}"
|
||||
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
@${RM} -rf data .converted_embedding_model.txt .converted_model.txt .embedding_model_name.txt .model_name.txt
|
||||
|
||||
408
examples/model-conversion/README.md
Normal file
408
examples/model-conversion/README.md
Normal file
@@ -0,0 +1,408 @@
|
||||
# Model Conversion Example
|
||||
This directory contains scripts and code to help in the process of converting
|
||||
HuggingFace PyTorch models to GGUF format.
|
||||
|
||||
The motivation for having this is that the conversion process can often be an
|
||||
iterative process, where the original model is inspected, converted, updates
|
||||
made to llama.cpp, converted again, etc. Once the model has been converted it
|
||||
needs to be verified against the original model, and then optionally quantified,
|
||||
and in some cases perplexity checked of the quantized model. And finally the
|
||||
model/models need to the ggml-org on Hugging Face. This tool/example tries to
|
||||
help with this process.
|
||||
|
||||
> 📝 **Note:** When adding a new model from an existing family, verify the
|
||||
> previous version passes logits verification first. Existing models can have
|
||||
> subtle numerical differences that don't affect generation quality but cause
|
||||
> logits mismatches. Identifying these upfront whether they exist in llama.cpp,
|
||||
> the conversion script, or in an upstream implementation, can save significant
|
||||
> debugging time.
|
||||
|
||||
### Overview
|
||||
The idea is that the makefile targets and scripts here can be used in the
|
||||
development/conversion process assisting with things like:
|
||||
|
||||
* inspect/run the original model to figure out how it works
|
||||
* convert the original model to GGUF format
|
||||
* inspect/run the converted model
|
||||
* verify the logits produced by the original model and the converted model
|
||||
* quantize the model to GGUF format
|
||||
* run perplexity evaluation to verify that the quantized model is performing
|
||||
as expected
|
||||
* upload the model to HuggingFace to make it available for others
|
||||
|
||||
## Setup
|
||||
Create virtual python environment
|
||||
```console
|
||||
$ python3.11 -m venv venv
|
||||
$ source venv/bin/activate
|
||||
(venv) $ pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Causal Language Model Conversion
|
||||
This section describes the steps to convert a causal language model to GGUF and
|
||||
to verify that the conversion was successful.
|
||||
|
||||
### Download the original model
|
||||
First, clone the original model to some local directory:
|
||||
```console
|
||||
$ mkdir models && cd models
|
||||
$ git clone https://huggingface.co/user/model_name
|
||||
$ cd model_name
|
||||
$ git lfs install
|
||||
$ git lfs pull
|
||||
```
|
||||
|
||||
### Set the MODEL_PATH
|
||||
The path to the downloaded model can be provided in two ways:
|
||||
|
||||
**Option 1: Environment variable (recommended for iterative development)**
|
||||
```console
|
||||
export MODEL_PATH=~/work/ai/models/some_model
|
||||
```
|
||||
|
||||
**Option 2: Command line argument (for one-off tasks)**
|
||||
```console
|
||||
make causal-convert-model MODEL_PATH=~/work/ai/models/some_model
|
||||
```
|
||||
|
||||
Command line arguments take precedence over environment variables when both are provided.
|
||||
|
||||
In cases where the transformer implementation for the model has not been released
|
||||
yet it is possible to set the environment variable `UNRELEASED_MODEL_NAME` which
|
||||
will then cause the transformer implementation to be loaded explicitely and not
|
||||
use AutoModelForCausalLM:
|
||||
```
|
||||
export UNRELEASED_MODEL_NAME=SomeNewModel
|
||||
```
|
||||
|
||||
### Inspecting the original tensors
|
||||
```console
|
||||
# Using environment variable
|
||||
(venv) $ make causal-inspect-original-model
|
||||
|
||||
# Or using command line argument
|
||||
(venv) $ make causal-inspect-original-model MODEL_PATH=~/work/ai/models/some_model
|
||||
```
|
||||
|
||||
### Running the original model
|
||||
This is mainly to verify that the original model works, and to compare the output
|
||||
from the converted model.
|
||||
```console
|
||||
# Using environment variable
|
||||
(venv) $ make causal-run-original-model
|
||||
|
||||
# Or using command line argument
|
||||
(venv) $ make causal-run-original-model MODEL_PATH=~/work/ai/models/some_model
|
||||
```
|
||||
This command will save two files to the `data` directory, one is a binary file
|
||||
containing logits which will be used for comparison with the converted model
|
||||
later, and the other is a text file which allows for manual visual inspection.
|
||||
|
||||
### Model conversion
|
||||
After updates have been made to [gguf-py](../../gguf-py) to add support for the
|
||||
new model, the model can be converted to GGUF format using the following command:
|
||||
```console
|
||||
# Using environment variable
|
||||
(venv) $ make causal-convert-model
|
||||
|
||||
# Or using command line argument
|
||||
(venv) $ make causal-convert-model MODEL_PATH=~/work/ai/models/some_model
|
||||
```
|
||||
|
||||
### Inspecting the converted model
|
||||
The converted model can be inspected using the following command:
|
||||
```console
|
||||
(venv) $ make causal-inspect-converted-model
|
||||
```
|
||||
|
||||
### Running the converted model
|
||||
```console
|
||||
(venv) $ make causal-run-converted-model
|
||||
```
|
||||
|
||||
### Model logits verfication
|
||||
The following target will run the original model and the converted model and
|
||||
compare the logits:
|
||||
```console
|
||||
(venv) $ make causal-verify-logits
|
||||
```
|
||||
|
||||
### Quantizing the model
|
||||
The causal model can be quantized to GGUF format using the following command:
|
||||
```console
|
||||
(venv) $ make causal-quantize-Q8_0
|
||||
Quantized model saved to: /path/to/quantized/model-Q8_0.gguf
|
||||
Export the quantized model path to QUANTIZED_MODEL variable in your environment
|
||||
```
|
||||
This will show the path to the quantized model in the terminal, which can then
|
||||
be used to set the `QUANTIZED_MODEL` environment variable:
|
||||
```console
|
||||
export QUANTIZED_MODEL=/path/to/quantized/model-Q8_0.gguf
|
||||
```
|
||||
Then the quantized model can be run using the following command:
|
||||
```console
|
||||
(venv) $ make causal-run-quantized-model
|
||||
```
|
||||
|
||||
### Quantizing QAT (Quantization Aware Training) models
|
||||
When quantizing to `Q4_0`, the default data type for the token embedding weights
|
||||
will be `Q6_K`. For models that are going to be uploaded to ggml-org it is
|
||||
recommended to use `Q8_0` instead for the embeddings and output tensors.
|
||||
The reason is that although `Q6_K` is smaller in size, it requires more compute
|
||||
to unpack, which can hurt performance during output generation when the entire
|
||||
embedding matrix must be dequantized to compute vocabulary logits. `Q8_0`
|
||||
provides practically full quality with better computational efficiency.
|
||||
```console
|
||||
(venv) $ make causal-quantize-qat-Q4_0
|
||||
```
|
||||
|
||||
|
||||
## Embedding Language Model Conversion
|
||||
|
||||
### Download the original model
|
||||
```console
|
||||
$ mkdir models && cd models
|
||||
$ git clone https://huggingface.co/user/model_name
|
||||
$ cd model_name
|
||||
$ git lfs install
|
||||
$ git lfs pull
|
||||
```
|
||||
|
||||
The path to the embedding model can be provided in two ways:
|
||||
|
||||
**Option 1: Environment variable (recommended for iterative development)**
|
||||
```console
|
||||
export EMBEDDING_MODEL_PATH=~/path/to/embedding_model
|
||||
```
|
||||
|
||||
**Option 2: Command line argument (for one-off tasks)**
|
||||
```console
|
||||
make embedding-convert-model EMBEDDING_MODEL_PATH=~/path/to/embedding_model
|
||||
```
|
||||
|
||||
Command line arguments take precedence over environment variables when both are provided.
|
||||
|
||||
### Running the original model
|
||||
This is mainly to verify that the original model works and to compare the output
|
||||
with the output from the converted model.
|
||||
```console
|
||||
# Using environment variable
|
||||
(venv) $ make embedding-run-original-model
|
||||
|
||||
# Or using command line argument
|
||||
(venv) $ make embedding-run-original-model EMBEDDING_MODEL_PATH=~/path/to/embedding_model
|
||||
```
|
||||
This command will save two files to the `data` directory, one is a binary
|
||||
file containing logits which will be used for comparison with the converted
|
||||
model, and the other is a text file which allows for manual visual inspection.
|
||||
|
||||
#### Using SentenceTransformer with numbered layers
|
||||
For models that have numbered SentenceTransformer layers (01_Pooling, 02_Dense,
|
||||
03_Dense, 04_Normalize), these will be applied automatically when running the
|
||||
converted model but currently there is a separate target to run the original
|
||||
version:
|
||||
|
||||
```console
|
||||
# Run original model with SentenceTransformer (applies all numbered layers)
|
||||
(venv) $ make embedding-run-original-model-st
|
||||
```
|
||||
|
||||
This will use the SentenceTransformer library to load and run the model, which
|
||||
automatically applies all the numbered layers in the correct order. This is
|
||||
particularly useful when comparing with models that should include these
|
||||
additional transformation layers beyond just the base model output.
|
||||
|
||||
The type of normalization can be specified for the converted model but is not
|
||||
strictly necessary as the verification uses cosine similarity and the magnitude
|
||||
of the output vectors does not affect this. But the normalization type can be
|
||||
specified as an argument to the target which might be useful for manual
|
||||
inspection:
|
||||
```console
|
||||
(venv) $ make embedding-verify-logits-st EMBD_NORMALIZE=1
|
||||
```
|
||||
The original model will apply the normalization according to the normalization
|
||||
layer specified in the modules.json configuration file.
|
||||
|
||||
### Model conversion
|
||||
After updates have been made to [gguf-py](../../gguf-py) to add support for the
|
||||
new model the model can be converted to GGUF format using the following command:
|
||||
```console
|
||||
(venv) $ make embedding-convert-model
|
||||
```
|
||||
|
||||
### Run the converted model
|
||||
```console
|
||||
(venv) $ make embedding-run-converted-model
|
||||
```
|
||||
|
||||
### Model logits verfication
|
||||
The following target will run the original model and the converted model (which
|
||||
was done manually in the previous steps) and compare the logits:
|
||||
```console
|
||||
(venv) $ make embedding-verify-logits
|
||||
```
|
||||
|
||||
For models with SentenceTransformer layers, use the `-st` verification target:
|
||||
```console
|
||||
(venv) $ make embedding-verify-logits-st
|
||||
```
|
||||
This convenience target automatically runs both the original model with SentenceTransformer
|
||||
and the converted model with pooling enabled, then compares the results.
|
||||
|
||||
### llama-server verification
|
||||
To verify that the converted model works with llama-server, the following
|
||||
command can be used:
|
||||
```console
|
||||
(venv) $ make embedding-start-embedding-server
|
||||
```
|
||||
Then open another terminal and set the `EMBEDDINGS_MODEL_PATH` environment
|
||||
variable as this will not be inherited by the new terminal:
|
||||
```console
|
||||
(venv) $ make embedding-curl-embedding-endpoint
|
||||
```
|
||||
This will call the `embedding` endpoing and the output will be piped into
|
||||
the same verification script as used by the target `embedding-verify-logits`.
|
||||
|
||||
The causal model can also be used to produce embeddings and this can be verified
|
||||
using the following commands:
|
||||
```console
|
||||
(venv) $ make causal-start-embedding-server
|
||||
```
|
||||
Then open another terminal and set the `MODEL_PATH` environment
|
||||
variable as this will not be inherited by the new terminal:
|
||||
```console
|
||||
(venv) $ make casual-curl-embedding-endpoint
|
||||
```
|
||||
|
||||
### Quantizing the model
|
||||
The embedding model can be quantized to GGUF format using the following command:
|
||||
```console
|
||||
(venv) $ make embedding-quantize-Q8_0
|
||||
Quantized model saved to: /path/to/quantized/model-Q8_0.gguf
|
||||
Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment
|
||||
```
|
||||
This will show the path to the quantized model in the terminal, which can then
|
||||
be used to set the `QUANTIZED_EMBEDDING_MODEL` environment variable:
|
||||
```console
|
||||
export QUANTIZED_EMBEDDING_MODEL=/path/to/quantized/model-Q8_0.gguf
|
||||
```
|
||||
Then the quantized model can be run using the following command:
|
||||
```console
|
||||
(venv) $ make embedding-run-quantized-model
|
||||
```
|
||||
|
||||
### Quantizing QAT (Quantization Aware Training) models
|
||||
When quantizing to `Q4_0`, the default data type for the token embedding weights
|
||||
will be `Q6_K`. For models that are going to be uploaded to ggml-org it is
|
||||
recommended to use `Q8_0` instead for the embeddings and output tensors.
|
||||
The reason is that although `Q6_K` is smaller in size, it requires more compute
|
||||
to unpack, which can hurt performance during output generation when the entire
|
||||
embedding matrix must be dequantized to compute vocabulary logits. `Q8_0`
|
||||
provides practically full quality with better computational efficiency.
|
||||
```console
|
||||
(venv) $ make embedding-quantize-qat-Q4_0
|
||||
```
|
||||
|
||||
## Perplexity Evaluation
|
||||
|
||||
### Simple perplexity evaluation
|
||||
This allows to run the perplexity evaluation without having to generate a
|
||||
token/logits file:
|
||||
```console
|
||||
(venv) $ make perplexity-run QUANTIZED_MODEL=~/path/to/quantized/model.gguf
|
||||
```
|
||||
This will use the wikitext dataset to run the perplexity evaluation and
|
||||
output the perplexity score to the terminal. This value can then be compared
|
||||
with the perplexity score of the unquantized model.
|
||||
|
||||
### Full perplexity evaluation
|
||||
First use the converted, non-quantized, model to generate the perplexity evaluation
|
||||
dataset using the following command:
|
||||
```console
|
||||
$ make perplexity-data-gen CONVERTED_MODEL=~/path/to/converted/model.gguf
|
||||
```
|
||||
This will generate a file in the `data` directory named after the model and with
|
||||
a `.kld` suffix which contains the tokens and the logits for the wikitext dataset.
|
||||
|
||||
After the dataset has been generated, the perplexity evaluation can be run using
|
||||
the quantized model:
|
||||
```console
|
||||
$ make perplexity-run-full QUANTIZED_MODEL=~/path/to/quantized/model-Qxx.gguf LOGITS_FILE=data/model.gguf.ppl
|
||||
```
|
||||
|
||||
> 📝 **Note:** The `LOGITS_FILE` is the file generated by the previous command
|
||||
> can be very large, so make sure you have enough disk space available.
|
||||
|
||||
## HuggingFace utilities
|
||||
The following targets are useful for creating collections and model repositories
|
||||
on Hugging Face in the the ggml-org. These can be used when preparing a relase
|
||||
to script the process for new model releases.
|
||||
|
||||
For the following targets a `HF_TOKEN` environment variable is required.
|
||||
|
||||
> 📝 **Note:** Don't forget to logout from Hugging Face after running these
|
||||
> commands, otherwise you might have issues pulling/cloning repositories as
|
||||
> the token will still be in use:
|
||||
> $ huggingface-cli logout
|
||||
> $ unset HF_TOKEN
|
||||
|
||||
### Create a new Hugging Face Model (model repository)
|
||||
This will create a new model repsository on Hugging Face with the specified
|
||||
model name.
|
||||
```console
|
||||
(venv) $ make hf-create-model MODEL_NAME='TestModel' NAMESPACE="danbev" ORIGINAL_BASE_MODEL="some-base-model"
|
||||
Repository ID: danbev/TestModel-GGUF
|
||||
Repository created: https://huggingface.co/danbev/TestModel-GGUF
|
||||
```
|
||||
Note that we append a `-GGUF` suffix to the model name to ensure a consistent
|
||||
naming convention for GGUF models.
|
||||
|
||||
An embedding model can be created using the following command:
|
||||
```console
|
||||
(venv) $ make hf-create-model-embedding MODEL_NAME='TestEmbeddingModel' NAMESPACE="danbev" ORIGINAL_BASE_MODEL="some-base-model"
|
||||
```
|
||||
The only difference is that the model card for an embedding model will be different
|
||||
with regards to the llama-server command and also how to access/call the embedding
|
||||
endpoint.
|
||||
|
||||
### Upload a GGUF model to model repository
|
||||
The following target uploads a model to an existing Hugging Face model repository.
|
||||
```console
|
||||
(venv) $ make hf-upload-gguf-to-model MODEL_PATH=dummy-model1.gguf REPO_ID=danbev/TestModel-GGUF
|
||||
📤 Uploading dummy-model1.gguf to danbev/TestModel-GGUF/dummy-model1.gguf
|
||||
✅ Upload successful!
|
||||
🔗 File available at: https://huggingface.co/danbev/TestModel-GGUF/blob/main/dummy-model1.gguf
|
||||
```
|
||||
This command can also be used to update an existing model file in a repository.
|
||||
|
||||
### Create a new Collection
|
||||
```console
|
||||
(venv) $ make hf-new-collection NAME=TestCollection DESCRIPTION="Collection for testing scripts" NAMESPACE=danbev
|
||||
🚀 Creating Hugging Face Collection
|
||||
Title: TestCollection
|
||||
Description: Collection for testing scripts
|
||||
Namespace: danbev
|
||||
Private: False
|
||||
✅ Authenticated as: danbev
|
||||
📚 Creating collection: 'TestCollection'...
|
||||
✅ Collection created successfully!
|
||||
📋 Collection slug: danbev/testcollection-68930fcf73eb3fc200b9956d
|
||||
🔗 Collection URL: https://huggingface.co/collections/danbev/testcollection-68930fcf73eb3fc200b9956d
|
||||
|
||||
🎉 Collection created successfully!
|
||||
Use this slug to add models: danbev/testcollection-68930fcf73eb3fc200b9956d
|
||||
```
|
||||
|
||||
### Add model to a Collection
|
||||
```console
|
||||
(venv) $ make hf-add-model-to-collection COLLECTION=danbev/testcollection-68930fcf73eb3fc200b9956d MODEL=danbev/TestModel-GGUF
|
||||
✅ Authenticated as: danbev
|
||||
🔍 Checking if model exists: danbev/TestModel-GGUF
|
||||
✅ Model found: danbev/TestModel-GGUF
|
||||
📚 Adding model to collection...
|
||||
✅ Model added to collection successfully!
|
||||
🔗 Collection URL: https://huggingface.co/collections/danbev/testcollection-68930fcf73eb3fc200b9956d
|
||||
|
||||
🎉 Model added successfully!
|
||||
|
||||
```
|
||||
7
examples/model-conversion/requirements.txt
Normal file
7
examples/model-conversion/requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch
|
||||
torchvision
|
||||
transformers
|
||||
huggingface-hub
|
||||
accelerate
|
||||
sentence-transformers
|
||||
46
examples/model-conversion/scripts/causal/compare-embeddings-logits.sh
Executable file
46
examples/model-conversion/scripts/causal/compare-embeddings-logits.sh
Executable file
@@ -0,0 +1,46 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
MODEL_PATH="${1:-"$MODEL_PATH"}"
|
||||
MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"
|
||||
|
||||
CONVERTED_MODEL_PATH="${1:-"$CONVERTED_MODEL"}"
|
||||
CONVERTED_MODEL_NAME="${2:-$(basename "$CONVERTED_MODEL_PATH" ".gguf")}"
|
||||
|
||||
if [ -t 0 ]; then
|
||||
CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin"
|
||||
else
|
||||
# Process piped JSON data and convert to binary (matching logits.cpp format)
|
||||
TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)
|
||||
python3 -c "
|
||||
import json
|
||||
import sys
|
||||
import struct
|
||||
|
||||
data = json.load(sys.stdin)
|
||||
|
||||
# Flatten all embeddings completely
|
||||
flattened = []
|
||||
for item in data:
|
||||
embedding = item['embedding']
|
||||
for token_embedding in embedding:
|
||||
flattened.extend(token_embedding)
|
||||
|
||||
print(f'Total embedding values: {len(flattened)}', file=sys.stderr)
|
||||
|
||||
# Write as binary floats - matches logitc.cpp fwrite format
|
||||
with open('$TEMP_FILE', 'wb') as f:
|
||||
for value in flattened:
|
||||
f.write(struct.pack('f', value))
|
||||
"
|
||||
CPP_EMBEDDINGS="$TEMP_FILE"
|
||||
trap "rm -f $TEMP_FILE" EXIT
|
||||
fi
|
||||
|
||||
python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
|
||||
--python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \
|
||||
--cpp-embeddings $CPP_EMBEDDINGS \
|
||||
--prompt "Hello world today" \
|
||||
--causal
|
||||
|
||||
87
examples/model-conversion/scripts/causal/compare-logits.py
Executable file
87
examples/model-conversion/scripts/causal/compare-logits.py
Executable file
@@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
# Add utils directory to path for direct script execution
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
|
||||
from common import get_model_name_from_env_path, compare_tokens, exit_with_warning # type: ignore[import-not-found]
|
||||
|
||||
def quick_logits_check(pytorch_file, llamacpp_file):
|
||||
"""Lightweight sanity check before NMSE"""
|
||||
|
||||
try:
|
||||
pytorch_logits = np.fromfile(pytorch_file, dtype=np.float32)
|
||||
llamacpp_logits = np.fromfile(llamacpp_file, dtype=np.float32)
|
||||
except Exception as e:
|
||||
print(f"❌ NOK: Failed to load files - {e}")
|
||||
return False
|
||||
|
||||
# Check shapes match
|
||||
if pytorch_logits.shape != llamacpp_logits.shape:
|
||||
print(f"❌ NOK: Shape mismatch - PyTorch: {pytorch_logits.shape}, llama.cpp: {llamacpp_logits.shape}")
|
||||
return False
|
||||
|
||||
# Calculate key metrics
|
||||
diff = pytorch_logits - llamacpp_logits
|
||||
abs_diff = np.abs(diff)
|
||||
max_diff = np.max(abs_diff)
|
||||
|
||||
# Get top 10 predictions from both models
|
||||
pytorch_top10 = np.argsort(pytorch_logits)[-10:][::-1]
|
||||
llamacpp_top10 = np.argsort(llamacpp_logits)[-10:][::-1]
|
||||
print(f"Top 10 PyTorch logits: {pytorch_logits[pytorch_top10]}")
|
||||
print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}")
|
||||
print(f"Max absolute difference: {max_diff:.4f}")
|
||||
|
||||
return True
|
||||
|
||||
def main():
|
||||
model_path = os.environ.get('MODEL_PATH')
|
||||
model_name = get_model_name_from_env_path('MODEL_PATH')
|
||||
data_dir = Path("data")
|
||||
pytorch_file = data_dir / f"pytorch-{model_name}.bin"
|
||||
|
||||
llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL')
|
||||
print(f"Using converted model: {llamacpp_model_name}")
|
||||
llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin"
|
||||
|
||||
if not pytorch_file.exists():
|
||||
print(f"Error: PyTorch logits file not found: {pytorch_file}")
|
||||
print("Please run scripts/run-org-model.sh first to generate this file.")
|
||||
sys.exit(1)
|
||||
|
||||
if not llamacpp_file.exists():
|
||||
print(f"Error: llama.cpp logits file not found: {llamacpp_file}")
|
||||
print("Please run scripts/run-converted-model.sh first to generate this file.")
|
||||
sys.exit(1)
|
||||
|
||||
print("Checked all required files were found. Proceeding...\n")
|
||||
|
||||
# Verify tokens as they are a prerequisite for logits comparison.
|
||||
print("🔍 Token Comparison Check")
|
||||
print("=" * 40)
|
||||
if not compare_tokens(f"pytorch-{model_name}", f"llamacpp-{llamacpp_model_name}"):
|
||||
exit_with_warning("\n❌ Token mismatch detected", model_path)
|
||||
print()
|
||||
|
||||
print("🔍 GGML Model Validation for model ", model_name)
|
||||
print("=" * 40)
|
||||
print(f"PyTorch logits : {pytorch_file}")
|
||||
print(f"llama.cpp logits: {llamacpp_file}")
|
||||
print()
|
||||
|
||||
success = quick_logits_check(pytorch_file, llamacpp_file)
|
||||
|
||||
# Exit with appropriate code
|
||||
if success:
|
||||
print("✅ OK: Lightweight model check successful!")
|
||||
print(" Ok to proceed with NMSE check...")
|
||||
sys.exit(0)
|
||||
else:
|
||||
exit_with_warning(f"❌ NOK: Top 10 predictions don't match - generation will differ", model_path)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
46
examples/model-conversion/scripts/causal/convert-model.sh
Executable file
46
examples/model-conversion/scripts/causal/convert-model.sh
Executable file
@@ -0,0 +1,46 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
# Parse command line arguments
|
||||
MMPROJ=""
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--mmproj)
|
||||
MMPROJ="--mmproj"
|
||||
shift
|
||||
;;
|
||||
*)
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
|
||||
OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
|
||||
TYPE="${OUTTYPE:-f16}"
|
||||
METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
|
||||
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
|
||||
|
||||
echo "Model path: ${MODEL_PATH}"
|
||||
echo "Model name: ${MODEL_NAME}"
|
||||
echo "Data type: ${TYPE}"
|
||||
echo "Converted model path:: ${CONVERTED_MODEL}"
|
||||
echo "Metadata override: ${METADATA_OVERRIDE}"
|
||||
|
||||
CMD_ARGS=("python" "../../convert_hf_to_gguf.py" "--verbose")
|
||||
CMD_ARGS+=("${MODEL_PATH}")
|
||||
CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}")
|
||||
CMD_ARGS+=("--outtype" "${TYPE}")
|
||||
[[ -n "$METADATA_OVERRIDE" ]] && CMD_ARGS+=("--metadata" "${METADATA_OVERRIDE}")
|
||||
[[ -n "$MMPROJ" ]] && CMD_ARGS+=("${MMPROJ}")
|
||||
|
||||
"${CMD_ARGS[@]}"
|
||||
|
||||
echo ""
|
||||
echo "The environment variable CONVERTED_MODEL can be set to this path using:"
|
||||
echo "export CONVERTED_MODEL=$(realpath ${CONVERTED_MODEL})"
|
||||
if [[ -n "$MMPROJ" ]]; then
|
||||
mmproj_file="${OUTPUT_DIR}/mmproj-$(basename "${CONVERTED_MODEL}")"
|
||||
echo "The mmproj model was created in $(realpath "$mmproj_file")"
|
||||
fi
|
||||
13
examples/model-conversion/scripts/causal/modelcard.template
Normal file
13
examples/model-conversion/scripts/causal/modelcard.template
Normal file
@@ -0,0 +1,13 @@
|
||||
---
|
||||
base_model:
|
||||
- {base_model}
|
||||
---
|
||||
# {model_name} GGUF
|
||||
|
||||
Recommended way to run this model:
|
||||
|
||||
```sh
|
||||
llama-server -hf {namespace}/{model_name}-GGUF
|
||||
```
|
||||
|
||||
Then, access http://localhost:8080
|
||||
114
examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py
Executable file
114
examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py
Executable file
@@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import importlib
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
|
||||
from pathlib import Path
|
||||
|
||||
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
|
||||
|
||||
parser = argparse.ArgumentParser(description='Process model with specified path')
|
||||
parser.add_argument('--model-path', '-m', help='Path to the model')
|
||||
args = parser.parse_args()
|
||||
|
||||
model_path = os.environ.get('MODEL_PATH', args.model_path)
|
||||
if model_path is None:
|
||||
parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
|
||||
|
||||
config = AutoConfig.from_pretrained(model_path)
|
||||
|
||||
print("Model type: ", config.model_type)
|
||||
print("Vocab size: ", config.vocab_size)
|
||||
print("Hidden size: ", config.hidden_size)
|
||||
print("Number of layers: ", config.num_hidden_layers)
|
||||
print("BOS token id: ", config.bos_token_id)
|
||||
print("EOS token id: ", config.eos_token_id)
|
||||
|
||||
print("Loading model and tokenizer using AutoTokenizer:", model_path)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||
|
||||
if unreleased_model_name:
|
||||
model_name_lower = unreleased_model_name.lower()
|
||||
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
|
||||
class_name = f"{unreleased_model_name}ForCausalLM"
|
||||
print(f"Importing unreleased model module: {unreleased_module_path}")
|
||||
|
||||
try:
|
||||
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
|
||||
model = model_class.from_pretrained(model_path)
|
||||
except (ImportError, AttributeError) as e:
|
||||
print(f"Failed to import or load model: {e}")
|
||||
print("Falling back to AutoModelForCausalLM")
|
||||
model = AutoModelForCausalLM.from_pretrained(model_path)
|
||||
else:
|
||||
model = AutoModelForCausalLM.from_pretrained(model_path)
|
||||
print(f"Model class: {type(model)}")
|
||||
#print(f"Model file: {type(model).__module__}")
|
||||
|
||||
model_name = os.path.basename(model_path)
|
||||
print(f"Model name: {model_name}")
|
||||
|
||||
prompt = "Hello world today"
|
||||
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
|
||||
print(f"Input tokens: {input_ids}")
|
||||
print(f"Input text: {repr(prompt)}")
|
||||
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = model(input_ids, output_hidden_states=True)
|
||||
|
||||
# Extract hidden states from the last layer
|
||||
# outputs.hidden_states is a tuple of (num_layers + 1) tensors
|
||||
# Index -1 gets the last layer, shape: [batch_size, seq_len, hidden_size]
|
||||
last_hidden_states = outputs.hidden_states[-1]
|
||||
|
||||
# Get embeddings for all tokens
|
||||
token_embeddings = last_hidden_states[0].float().cpu().numpy() # Remove batch dimension
|
||||
|
||||
print(f"Hidden states shape: {last_hidden_states.shape}")
|
||||
print(f"Token embeddings shape: {token_embeddings.shape}")
|
||||
print(f"Hidden dimension: {token_embeddings.shape[-1]}")
|
||||
print(f"Number of tokens: {token_embeddings.shape[0]}")
|
||||
|
||||
# Save raw token embeddings
|
||||
data_dir = Path("data")
|
||||
data_dir.mkdir(exist_ok=True)
|
||||
bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
|
||||
txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
|
||||
|
||||
# Save all token embeddings as binary
|
||||
print(token_embeddings)
|
||||
token_embeddings.astype(np.float32).tofile(bin_filename)
|
||||
|
||||
# Save as text for inspection
|
||||
with open(txt_filename, "w") as f:
|
||||
for i, embedding in enumerate(token_embeddings):
|
||||
for j, val in enumerate(embedding):
|
||||
f.write(f"{i} {j} {val:.6f}\n")
|
||||
|
||||
# Print embeddings per token in the requested format
|
||||
print("\nToken embeddings:")
|
||||
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
|
||||
for i, embedding in enumerate(token_embeddings):
|
||||
# Format: show first few values, ..., then last few values
|
||||
if len(embedding) > 10:
|
||||
# Show first 3 and last 3 values with ... in between
|
||||
first_vals = " ".join(f"{val:8.6f}" for val in embedding[:3])
|
||||
last_vals = " ".join(f"{val:8.6f}" for val in embedding[-3:])
|
||||
print(f"embedding {i}: {first_vals} ... {last_vals}")
|
||||
else:
|
||||
# If embedding is short, show all values
|
||||
vals = " ".join(f"{val:8.6f}" for val in embedding)
|
||||
print(f"embedding {i}: {vals}")
|
||||
|
||||
# Also show token info for reference
|
||||
print(f"\nToken reference:")
|
||||
for i, token in enumerate(tokens):
|
||||
print(f" Token {i}: {repr(token)}")
|
||||
|
||||
print(f"Saved bin logits to: {bin_filename}")
|
||||
print(f"Saved txt logist to: {txt_filename}")
|
||||
@@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
# First try command line argument, then environment variable, then file
|
||||
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
|
||||
|
||||
# Final check if we have a model path
|
||||
if [ -z "$CONVERTED_MODEL" ]; then
|
||||
echo "Error: Model path must be provided either as:" >&2
|
||||
echo " 1. Command line argument" >&2
|
||||
echo " 2. CONVERTED_MODEL environment variable" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cmake --build ../../build --target llama-debug -j8
|
||||
|
||||
../../build/bin/llama-debug -m $CONVERTED_MODEL --embedding -p "Hello world today" --save-logits
|
||||
26
examples/model-conversion/scripts/causal/run-converted-model.sh
Executable file
26
examples/model-conversion/scripts/causal/run-converted-model.sh
Executable file
@@ -0,0 +1,26 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
# First try command line argument, then environment variable, then file
|
||||
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
|
||||
MODEL_TESTING_PROMPT="${2:-"$MODEL_TESTING_PROMPT"}"
|
||||
|
||||
if [ -z "$MODEL_TESTING_PROMPT"]; then
|
||||
MODEL_TESTING_PROMPT="Hello, my name is"
|
||||
fi
|
||||
|
||||
# Final check if we have a model path
|
||||
if [ -z "$CONVERTED_MODEL" ]; then
|
||||
echo "Error: Model path must be provided either as:" >&2
|
||||
echo " 1. Command line argument" >&2
|
||||
echo " 2. CONVERTED_MODEL environment variable" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo $CONVERTED_MODEL
|
||||
echo $MODEL_TESTING_PROMPT
|
||||
|
||||
cmake --build ../../build --target llama-debug -j8
|
||||
|
||||
../../build/bin/llama-debug -m "$CONVERTED_MODEL" -p "$MODEL_TESTING_PROMPT" --save-logits
|
||||
168
examples/model-conversion/scripts/causal/run-org-model.py
Executable file
168
examples/model-conversion/scripts/causal/run-org-model.py
Executable file
@@ -0,0 +1,168 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import importlib
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, AutoConfig
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
||||
from utils.common import debug_hook, save_output_data
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(description="Process model with specified path")
|
||||
parser.add_argument("--model-path", "-m", help="Path to the model")
|
||||
parser.add_argument("--prompt-file", "-f", help="Optional prompt file", required=False)
|
||||
parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose debug output")
|
||||
parser.add_argument("--device", "-d", help="Device to use (cpu, cuda, mps, auto)", default="auto")
|
||||
return parser.parse_args()
|
||||
|
||||
def load_model_and_tokenizer(model_path, device="auto"):
|
||||
print("Loading model and tokenizer using AutoTokenizer:", model_path)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
||||
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
|
||||
multimodal = False
|
||||
full_config = config
|
||||
|
||||
# Determine device_map based on device argument
|
||||
if device == "cpu":
|
||||
device_map = {"": "cpu"}
|
||||
print("Forcing CPU usage")
|
||||
elif device == "auto":
|
||||
device_map = "auto"
|
||||
else:
|
||||
device_map = {"": device}
|
||||
|
||||
print("Model type: ", config.model_type)
|
||||
if "vocab_size" not in config and "text_config" in config:
|
||||
config = config.text_config
|
||||
multimodal = True
|
||||
|
||||
print("Vocab size: ", config.vocab_size)
|
||||
print("Hidden size: ", config.hidden_size)
|
||||
print("Number of layers: ", config.num_hidden_layers)
|
||||
print("BOS token id: ", config.bos_token_id)
|
||||
print("EOS token id: ", config.eos_token_id)
|
||||
|
||||
unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
|
||||
if unreleased_model_name:
|
||||
model_name_lower = unreleased_model_name.lower()
|
||||
unreleased_module_path = (
|
||||
f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
|
||||
)
|
||||
class_name = f"{unreleased_model_name}ForCausalLM"
|
||||
print(f"Importing unreleased model module: {unreleased_module_path}")
|
||||
|
||||
try:
|
||||
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
|
||||
model = model_class.from_pretrained(
|
||||
model_path,
|
||||
device_map=device_map,
|
||||
offload_folder="offload",
|
||||
trust_remote_code=True,
|
||||
config=config
|
||||
)
|
||||
except (ImportError, AttributeError) as e:
|
||||
print(f"Failed to import or load model: {e}")
|
||||
exit(1)
|
||||
else:
|
||||
if multimodal:
|
||||
model = AutoModelForImageTextToText.from_pretrained(
|
||||
model_path,
|
||||
device_map=device_map,
|
||||
offload_folder="offload",
|
||||
trust_remote_code=True,
|
||||
config=full_config
|
||||
)
|
||||
else:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_path,
|
||||
device_map=device_map,
|
||||
offload_folder="offload",
|
||||
trust_remote_code=True,
|
||||
config=config
|
||||
)
|
||||
|
||||
print(f"Model class: {model.__class__.__name__}")
|
||||
|
||||
return model, tokenizer, config
|
||||
|
||||
def enable_torch_debugging(model):
|
||||
for name, module in model.named_modules():
|
||||
if len(list(module.children())) == 0: # only leaf modules
|
||||
module.register_forward_hook(debug_hook(name))
|
||||
|
||||
def get_prompt(args):
|
||||
if args.prompt_file:
|
||||
with open(args.prompt_file, encoding='utf-8') as f:
|
||||
return f.read()
|
||||
elif os.getenv("MODEL_TESTING_PROMPT"):
|
||||
return os.getenv("MODEL_TESTING_PROMPT")
|
||||
else:
|
||||
return "Hello, my name is"
|
||||
|
||||
def main():
|
||||
args = parse_arguments()
|
||||
model_path = os.environ.get("MODEL_PATH", args.model_path)
|
||||
if model_path is None:
|
||||
print("Error: Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
model, tokenizer, config = load_model_and_tokenizer(model_path, args.device)
|
||||
|
||||
if args.verbose:
|
||||
enable_torch_debugging(model)
|
||||
|
||||
model_name = os.path.basename(model_path)
|
||||
|
||||
# Iterate over the model parameters (the tensors) and get the first one
|
||||
# and use it to get the device the model is on.
|
||||
device = next(model.parameters()).device
|
||||
prompt = get_prompt(args)
|
||||
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
|
||||
token_ids = input_ids[0].cpu().tolist()
|
||||
|
||||
print(f"Input tokens: {input_ids}")
|
||||
print(f"Input text: {repr(prompt)}")
|
||||
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
|
||||
|
||||
batch_size = 512
|
||||
|
||||
with torch.no_grad():
|
||||
past = None
|
||||
outputs = None
|
||||
for i in range(0, input_ids.size(1), batch_size):
|
||||
print(f"Processing chunk with tokens {i} to {i + batch_size}")
|
||||
chunk = input_ids[:, i:i + batch_size]
|
||||
outputs = model(chunk.to(model.device), past_key_values=past, use_cache=True)
|
||||
past = outputs.past_key_values
|
||||
|
||||
logits = outputs.logits # type: ignore
|
||||
|
||||
# Extract logits for the last token (next token prediction)
|
||||
last_logits = logits[0, -1, :].float().cpu().numpy()
|
||||
|
||||
print(f"Logits shape: {logits.shape}")
|
||||
print(f"Last token logits shape: {last_logits.shape}")
|
||||
print(f"Vocab size: {len(last_logits)}")
|
||||
|
||||
# Print some sample logits for quick verification
|
||||
print(f"First 10 logits: {last_logits[:10]}")
|
||||
print(f"Last 10 logits: {last_logits[-10:]}")
|
||||
|
||||
# Show top 5 predicted tokens
|
||||
top_indices = np.argsort(last_logits)[-5:][::-1]
|
||||
print("Top 5 predictions:")
|
||||
for idx in top_indices:
|
||||
token = tokenizer.decode([idx])
|
||||
print(f" Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")
|
||||
|
||||
save_output_data(last_logits, token_ids, prompt, model_name)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
84
examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh
Executable file
84
examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh
Executable file
@@ -0,0 +1,84 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
# Parse command line arguments
|
||||
MODEL_PATH=""
|
||||
MODEL_NAME=""
|
||||
PROMPTS_FILE=""
|
||||
|
||||
# First argument is always model path
|
||||
if [ $# -gt 0 ] && [[ "$1" != --* ]]; then
|
||||
MODEL_PATH="$1"
|
||||
shift
|
||||
fi
|
||||
|
||||
# Parse remaining arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--prompts-file|-pf)
|
||||
PROMPTS_FILE="$2"
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
# If MODEL_NAME not set and this isn't a flag, use as model name
|
||||
if [ -z "$MODEL_NAME" ] && [[ "$1" != --* ]]; then
|
||||
MODEL_NAME="$1"
|
||||
fi
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Set defaults
|
||||
MODEL_PATH="${MODEL_PATH:-"$EMBEDDING_MODEL_PATH"}"
|
||||
MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
|
||||
|
||||
CONVERTED_MODEL_PATH="${CONVERTED_EMBEDDING_PATH:-"$CONVERTED_EMBEDDING_MODEL"}"
|
||||
CONVERTED_MODEL_NAME="${CONVERTED_MODEL_NAME:-$(basename "$CONVERTED_MODEL_PATH" .gguf)}"
|
||||
|
||||
if [ -t 0 ]; then
|
||||
CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin"
|
||||
else
|
||||
# Process piped JSON data and convert to binary (matching logits.cpp format)
|
||||
TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)
|
||||
python3 -c "
|
||||
import json
|
||||
import sys
|
||||
import struct
|
||||
|
||||
data = json.load(sys.stdin)
|
||||
|
||||
# Flatten all embeddings completely
|
||||
flattened = []
|
||||
for item in data:
|
||||
embedding = item['embedding']
|
||||
for token_embedding in embedding:
|
||||
flattened.extend(token_embedding)
|
||||
|
||||
print(f'Total embedding values: {len(flattened)}', file=sys.stderr)
|
||||
|
||||
# Write as binary floats - matches logitc.cpp fwrite format
|
||||
with open('$TEMP_FILE', 'wb') as f:
|
||||
for value in flattened:
|
||||
f.write(struct.pack('f', value))
|
||||
"
|
||||
CPP_EMBEDDINGS="$TEMP_FILE"
|
||||
trap "rm -f $TEMP_FILE" EXIT
|
||||
fi
|
||||
|
||||
# Build the semantic_check.py command
|
||||
SEMANTIC_CMD="python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
|
||||
--python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \
|
||||
--cpp-embeddings $CPP_EMBEDDINGS"
|
||||
|
||||
# Add prompts file if specified, otherwise use default prompt
|
||||
if [ -n "$PROMPTS_FILE" ]; then
|
||||
SEMANTIC_CMD="$SEMANTIC_CMD --prompts-file \"$PROMPTS_FILE\""
|
||||
else
|
||||
SEMANTIC_CMD="$SEMANTIC_CMD --prompt \"Hello world today\""
|
||||
fi
|
||||
|
||||
# Execute the command
|
||||
eval $SEMANTIC_CMD
|
||||
|
||||
38
examples/model-conversion/scripts/embedding/convert-model.sh
Executable file
38
examples/model-conversion/scripts/embedding/convert-model.sh
Executable file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
# Parse command line arguments
|
||||
SENTENCE_TRANSFORMERS=""
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
-st|--sentence-transformers)
|
||||
SENTENCE_TRANSFORMERS="--sentence-transformers-dense-modules"
|
||||
shift
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option: $1"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
MODEL_NAME="${MODEL_NAME:-$(basename "$EMBEDDING_MODEL_PATH")}"
|
||||
OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
|
||||
TYPE="${OUTTYPE:-f16}"
|
||||
METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
|
||||
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
|
||||
|
||||
echo "Model path: ${EMBEDDING_MODEL_PATH}"
|
||||
echo "Model name: ${MODEL_NAME}"
|
||||
echo "Data type: ${TYPE}"
|
||||
echo "Converted model path:: ${CONVERTED_MODEL}"
|
||||
python ../../convert_hf_to_gguf.py --verbose \
|
||||
${EMBEDDING_MODEL_PATH} \
|
||||
--outfile ${CONVERTED_MODEL} \
|
||||
--outtype ${TYPE} \
|
||||
${SENTENCE_TRANSFORMERS}
|
||||
|
||||
echo ""
|
||||
echo "The environment variable CONVERTED_EMBEDDING MODEL can be set to this path using:"
|
||||
echo "export CONVERTED_EMBEDDING_MODEL=$(realpath ${CONVERTED_MODEL})"
|
||||
@@ -0,0 +1,48 @@
|
||||
---
|
||||
base_model:
|
||||
- {base_model}
|
||||
---
|
||||
# {model_name} GGUF
|
||||
|
||||
Recommended way to run this model:
|
||||
|
||||
```sh
|
||||
llama-server -hf {namespace}/{model_name}-GGUF --embeddings
|
||||
```
|
||||
|
||||
Then the endpoint can be accessed at http://localhost:8080/embedding, for
|
||||
example using `curl`:
|
||||
```console
|
||||
curl --request POST \
|
||||
--url http://localhost:8080/embedding \
|
||||
--header "Content-Type: application/json" \
|
||||
--data '{{"input": "Hello embeddings"}}' \
|
||||
--silent
|
||||
```
|
||||
|
||||
Alternatively, the `llama-embedding` command line tool can be used:
|
||||
```sh
|
||||
llama-embedding -hf {namespace}/{model_name}-GGUF --verbose-prompt -p "Hello embeddings"
|
||||
```
|
||||
|
||||
#### embd_normalize
|
||||
When a model uses pooling, or the pooling method is specified using `--pooling`,
|
||||
the normalization can be controlled by the `embd_normalize` parameter.
|
||||
|
||||
The default value is `2` which means that the embeddings are normalized using
|
||||
the Euclidean norm (L2). Other options are:
|
||||
* -1 No normalization
|
||||
* 0 Max absolute
|
||||
* 1 Taxicab
|
||||
* 2 Euclidean/L2
|
||||
* \>2 P-Norm
|
||||
|
||||
This can be passed in the request body to `llama-server`, for example:
|
||||
```sh
|
||||
--data '{{"input": "Hello embeddings", "embd_normalize": -1}}' \
|
||||
```
|
||||
|
||||
And for `llama-embedding`, by passing `--embd-normalize <value>`, for example:
|
||||
```sh
|
||||
llama-embedding -hf {namespace}/{model_name}-GGUF --embd-normalize -1 -p "Hello embeddings"
|
||||
```
|
||||
54
examples/model-conversion/scripts/embedding/run-converted-model.sh
Executable file
54
examples/model-conversion/scripts/embedding/run-converted-model.sh
Executable file
@@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
# Parse command line arguments
|
||||
CONVERTED_MODEL=""
|
||||
PROMPTS_FILE=""
|
||||
EMBD_NORMALIZE="2"
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
-p|--prompts-file)
|
||||
PROMPTS_FILE="$2"
|
||||
shift 2
|
||||
;;
|
||||
--embd-normalize)
|
||||
EMBD_NORMALIZE="$2"
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
if [ -z "$CONVERTED_MODEL" ]; then
|
||||
CONVERTED_MODEL="$1"
|
||||
fi
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# First try command line argument, then environment variable
|
||||
CONVERTED_MODEL="${CONVERTED_MODEL:-"$CONVERTED_EMBEDDING_MODEL"}"
|
||||
|
||||
# Final check if we have a model path
|
||||
if [ -z "$CONVERTED_MODEL" ]; then
|
||||
echo "Error: Model path must be provided either as:" >&2
|
||||
echo " 1. Command line argument" >&2
|
||||
echo " 2. CONVERTED_EMBEDDING_MODEL environment variable" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Read prompt from file or use default
|
||||
if [ -n "$PROMPTS_FILE" ]; then
|
||||
if [ ! -f "$PROMPTS_FILE" ]; then
|
||||
echo "Error: Prompts file '$PROMPTS_FILE' not found" >&2
|
||||
exit 1
|
||||
fi
|
||||
PROMPT=$(cat "$PROMPTS_FILE")
|
||||
else
|
||||
PROMPT="Hello world today"
|
||||
fi
|
||||
|
||||
echo $CONVERTED_MODEL
|
||||
|
||||
cmake --build ../../build --target llama-debug -j8
|
||||
../../build/bin/llama-debug -m "$CONVERTED_MODEL" --embedding -p "$PROMPT" --save-logits --embd-normalize $EMBD_NORMALIZE
|
||||
243
examples/model-conversion/scripts/embedding/run-original-model.py
Executable file
243
examples/model-conversion/scripts/embedding/run-original-model.py
Executable file
@@ -0,0 +1,243 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import importlib
|
||||
|
||||
from transformers import AutoTokenizer, AutoConfig, AutoModel
|
||||
import torch
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
||||
from utils.common import save_output_data
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(description='Run original embedding model')
|
||||
parser.add_argument(
|
||||
'--model-path',
|
||||
'-m',
|
||||
help='Path to the model'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--prompts-file',
|
||||
'-p',
|
||||
help='Path to file containing prompts (one per line)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--use-sentence-transformers',
|
||||
action='store_true',
|
||||
help=('Use SentenceTransformer to apply all numbered layers '
|
||||
'(01_Pooling, 02_Dense, 03_Dense, 04_Normalize)')
|
||||
)
|
||||
parser.add_argument(
|
||||
'--device',
|
||||
'-d',
|
||||
help='Device to use (cpu, cuda, mps, auto)',
|
||||
default='auto'
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device="auto"):
|
||||
if device == "cpu":
|
||||
device_map = {"": "cpu"}
|
||||
print("Forcing CPU usage")
|
||||
elif device == "auto":
|
||||
# On Mac, "auto" device_map can cause issues with accelerate
|
||||
# So we detect the best device manually
|
||||
if torch.cuda.is_available():
|
||||
device_map = {"": "cuda"}
|
||||
print("Using CUDA")
|
||||
elif torch.backends.mps.is_available():
|
||||
device_map = {"": "mps"}
|
||||
print("Using MPS (Apple Metal)")
|
||||
else:
|
||||
device_map = {"": "cpu"}
|
||||
print("Using CPU")
|
||||
else:
|
||||
device_map = {"": device}
|
||||
|
||||
if use_sentence_transformers:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
print("Using SentenceTransformer to apply all numbered layers")
|
||||
model = SentenceTransformer(model_path)
|
||||
tokenizer = model.tokenizer
|
||||
config = model[0].auto_model.config # type: ignore
|
||||
else:
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
|
||||
|
||||
# This can be used to override the sliding window size for manual testing. This
|
||||
# can be useful to verify the sliding window attention mask in the original model
|
||||
# and compare it with the converted .gguf model.
|
||||
if hasattr(config, 'sliding_window'):
|
||||
original_sliding_window = config.sliding_window
|
||||
print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")
|
||||
|
||||
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
|
||||
print(f"Using unreleased model: {unreleased_model_name}")
|
||||
if unreleased_model_name:
|
||||
model_name_lower = unreleased_model_name.lower()
|
||||
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
|
||||
class_name = f"{unreleased_model_name}Model"
|
||||
print(f"Importing unreleased model module: {unreleased_module_path}")
|
||||
|
||||
try:
|
||||
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
|
||||
model = model_class.from_pretrained(
|
||||
model_path,
|
||||
device_map=device_map,
|
||||
offload_folder="offload",
|
||||
trust_remote_code=True,
|
||||
config=config
|
||||
)
|
||||
except (ImportError, AttributeError) as e:
|
||||
print(f"Failed to import or load model: {e}")
|
||||
sys.exit(1)
|
||||
else:
|
||||
model = AutoModel.from_pretrained(
|
||||
model_path,
|
||||
device_map=device_map,
|
||||
offload_folder="offload",
|
||||
trust_remote_code=True,
|
||||
config=config
|
||||
)
|
||||
print(f"Model class: {type(model)}")
|
||||
print(f"Model file: {type(model).__module__}")
|
||||
|
||||
# Verify the model is using the correct sliding window
|
||||
if hasattr(model.config, 'sliding_window'): # type: ignore
|
||||
print(f"Model's sliding_window: {model.config.sliding_window}") # type: ignore
|
||||
else:
|
||||
print("Model config does not have sliding_window attribute")
|
||||
|
||||
return model, tokenizer, config
|
||||
|
||||
|
||||
def get_prompt(args):
|
||||
if args.prompts_file:
|
||||
try:
|
||||
with open(args.prompts_file, 'r', encoding='utf-8') as f:
|
||||
return f.read().strip()
|
||||
except FileNotFoundError:
|
||||
print(f"Error: Prompts file '{args.prompts_file}' not found")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"Error reading prompts file: {e}")
|
||||
sys.exit(1)
|
||||
else:
|
||||
return "Hello world today"
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_arguments()
|
||||
|
||||
model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
|
||||
if model_path is None:
|
||||
print("Error: Model path must be specified either via --model-path argument "
|
||||
"or EMBEDDING_MODEL_PATH environment variable")
|
||||
sys.exit(1)
|
||||
|
||||
# Determine if we should use SentenceTransformer
|
||||
use_st = (
|
||||
args.use_sentence_transformers or os.environ.get('USE_SENTENCE_TRANSFORMERS', '').lower() in ('1', 'true', 'yes')
|
||||
)
|
||||
|
||||
model, tokenizer, config = load_model_and_tokenizer(model_path, use_st, args.device)
|
||||
|
||||
# Get the device the model is on
|
||||
if not use_st:
|
||||
device = next(model.parameters()).device
|
||||
else:
|
||||
# For SentenceTransformer, get device from the underlying model
|
||||
device = next(model[0].auto_model.parameters()).device # type: ignore
|
||||
|
||||
model_name = os.path.basename(model_path)
|
||||
|
||||
prompt_text = get_prompt(args)
|
||||
texts = [prompt_text]
|
||||
|
||||
with torch.no_grad():
|
||||
if use_st:
|
||||
embeddings = model.encode(texts, convert_to_numpy=True)
|
||||
all_embeddings = embeddings # Shape: [batch_size, hidden_size]
|
||||
|
||||
encoded = tokenizer(
|
||||
texts,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
return_tensors="pt"
|
||||
)
|
||||
tokens = encoded['input_ids'][0]
|
||||
token_ids = tokens.cpu().tolist()
|
||||
token_strings = tokenizer.convert_ids_to_tokens(tokens)
|
||||
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
|
||||
print(f"{token_id:6d} -> '{token_str}'")
|
||||
|
||||
print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
|
||||
print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}") # type: ignore
|
||||
else:
|
||||
# Standard approach: use base model output only
|
||||
encoded = tokenizer(
|
||||
texts,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
return_tensors="pt"
|
||||
)
|
||||
|
||||
tokens = encoded['input_ids'][0]
|
||||
token_ids = tokens.cpu().tolist()
|
||||
token_strings = tokenizer.convert_ids_to_tokens(tokens)
|
||||
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
|
||||
print(f"{token_id:6d} -> '{token_str}'")
|
||||
|
||||
# Move inputs to the same device as the model
|
||||
encoded = {k: v.to(device) for k, v in encoded.items()}
|
||||
outputs = model(**encoded)
|
||||
hidden_states = outputs.last_hidden_state # Shape: [batch_size, seq_len, hidden_size]
|
||||
|
||||
all_embeddings = hidden_states[0].float().cpu().numpy() # Shape: [seq_len, hidden_size]
|
||||
|
||||
print(f"Hidden states shape: {hidden_states.shape}")
|
||||
print(f"All embeddings shape: {all_embeddings.shape}")
|
||||
print(f"Embedding dimension: {all_embeddings.shape[1]}")
|
||||
|
||||
if len(all_embeddings.shape) == 1:
|
||||
n_embd = all_embeddings.shape[0] # type: ignore
|
||||
n_embd_count = 1
|
||||
all_embeddings = all_embeddings.reshape(1, -1)
|
||||
else:
|
||||
n_embd = all_embeddings.shape[1] # type: ignore
|
||||
n_embd_count = all_embeddings.shape[0] # type: ignore
|
||||
|
||||
print()
|
||||
|
||||
for j in range(n_embd_count):
|
||||
embedding = all_embeddings[j]
|
||||
print(f"embedding {j}: ", end="")
|
||||
|
||||
# Print first 3 values
|
||||
for i in range(min(3, n_embd)):
|
||||
print(f"{embedding[i]:9.6f} ", end="")
|
||||
|
||||
print(" ... ", end="")
|
||||
|
||||
# Print last 3 values
|
||||
for i in range(n_embd - 3, n_embd):
|
||||
print(f"{embedding[i]:9.6f} ", end="")
|
||||
|
||||
print() # New line
|
||||
|
||||
print()
|
||||
|
||||
flattened_embeddings = all_embeddings.flatten()
|
||||
print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} embeddings × {n_embd} dimensions)")
|
||||
print("")
|
||||
|
||||
save_output_data(flattened_embeddings, token_ids, prompt_text, model_name, type_suffix="-embeddings")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
0
examples/model-conversion/scripts/utils/__init__.py
Normal file
0
examples/model-conversion/scripts/utils/__init__.py
Normal file
177
examples/model-conversion/scripts/utils/check-nmse.py
Executable file
177
examples/model-conversion/scripts/utils/check-nmse.py
Executable file
@@ -0,0 +1,177 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import numpy as np
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from common import get_model_name_from_env_path # type: ignore[import-not-found]
|
||||
|
||||
def calculate_nmse(reference, test):
|
||||
mse = np.mean((test - reference) ** 2)
|
||||
ref_var = np.var(reference)
|
||||
if ref_var == 0:
|
||||
nmse = float('inf') if mse > 0 else 0.0
|
||||
return mse, mse, ref_var
|
||||
|
||||
nmse = mse / ref_var
|
||||
|
||||
return nmse, mse, ref_var
|
||||
|
||||
def load_logits(file_path):
|
||||
if not os.path.exists(file_path):
|
||||
raise FileNotFoundError(f"File not found: {file_path}")
|
||||
|
||||
if file_path.suffix == '.npy':
|
||||
return np.load(file_path)
|
||||
elif file_path.suffix == '.bin':
|
||||
return np.fromfile(file_path, dtype=np.float32)
|
||||
else:
|
||||
# Try to load as text file
|
||||
try:
|
||||
# If it has index format "0: value", extract just values
|
||||
data = []
|
||||
with open(file_path, 'r') as f:
|
||||
for line in f:
|
||||
if ':' in line:
|
||||
# Format: "index: value"
|
||||
value = float(line.split(':')[1].strip())
|
||||
else:
|
||||
# Just the value
|
||||
value = float(line.strip())
|
||||
data.append(value)
|
||||
return np.array(data, dtype=np.float32)
|
||||
except:
|
||||
return np.loadtxt(file_path, dtype=np.float32)
|
||||
|
||||
def interpret_nmse(nmse):
|
||||
"""Provide interpretation of NMSE value"""
|
||||
if nmse == 0:
|
||||
return "Perfect match", "🎉"
|
||||
elif nmse < 1e-6:
|
||||
return "Essentially identical", "✅"
|
||||
elif nmse < 1e-4:
|
||||
return "Excellent match", "✅"
|
||||
elif nmse < 1e-3:
|
||||
return "Very good match", "👍"
|
||||
elif nmse < 1e-2:
|
||||
return "Good match", "👍"
|
||||
elif nmse < 0.1:
|
||||
return "Acceptable match", "⚠️"
|
||||
elif nmse < 1.0:
|
||||
return "Poor match", "❌"
|
||||
else:
|
||||
return "Very poor match (worse than noise)", "❌"
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='Validate model logits')
|
||||
parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory')
|
||||
args = parser.parse_args()
|
||||
|
||||
model_name = get_model_name_from_env_path('MODEL_PATH')
|
||||
data_dir = Path("data")
|
||||
|
||||
pytorch_file = data_dir / f"pytorch-{model_name}.bin"
|
||||
|
||||
llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL')
|
||||
llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin"
|
||||
|
||||
print(f"Model name: {model_name}")
|
||||
print(f"PyTorch logits file: {pytorch_file}")
|
||||
print(f"llama.cpp logits file: {llamacpp_file}")
|
||||
|
||||
reference_file = pytorch_file
|
||||
test_file = llamacpp_file
|
||||
|
||||
print("📊 NMSE Check for Model Comparison")
|
||||
print("=" * 50)
|
||||
print(f"Reference (ground truth): {reference_file}")
|
||||
print(f"Test (to evaluate): {test_file}")
|
||||
print()
|
||||
|
||||
try:
|
||||
print("Loading reference logits...")
|
||||
reference = load_logits(reference_file)
|
||||
print(f" Shape: {reference.shape}, Type: {reference.dtype}")
|
||||
|
||||
print("Loading test logits...")
|
||||
test = load_logits(test_file)
|
||||
print(f" Shape: {test.shape}, Type: {test.dtype}")
|
||||
|
||||
# Check shapes match
|
||||
if reference.shape != test.shape:
|
||||
print(f"\n❌ Error: Shape mismatch!")
|
||||
print(f" Reference: {reference.shape}")
|
||||
print(f" Test: {test.shape}")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"\n✅ Shapes match: {reference.shape}")
|
||||
|
||||
nmse, mse, ref_var = calculate_nmse(reference, test)
|
||||
|
||||
# Additional metrics
|
||||
max_abs_error = np.max(np.abs(test - reference))
|
||||
mean_abs_error = np.mean(np.abs(test - reference))
|
||||
|
||||
# Results
|
||||
print(f"\n📈 METRICS")
|
||||
print("=" * 30)
|
||||
print(f"MSE (Mean Squared Error): {mse:.6e}")
|
||||
print(f"Reference Variance: {ref_var:.6e}")
|
||||
print(f"NMSE: {nmse:.6e}")
|
||||
print(f"Max Absolute Error: {max_abs_error:.6f}")
|
||||
print(f"Mean Absolute Error: {mean_abs_error:.6f}")
|
||||
|
||||
# NMSE in dB (common in signal processing)
|
||||
if nmse > 0:
|
||||
nmse_db = 10 * np.log10(nmse)
|
||||
print(f"NMSE (dB): {nmse_db:.2f} dB")
|
||||
|
||||
# Interpretation
|
||||
interpretation, emoji = interpret_nmse(nmse)
|
||||
print(f"\n🎯 INTERPRETATION")
|
||||
print("=" * 30)
|
||||
print(f"{emoji} {interpretation}")
|
||||
|
||||
# Detailed guidance
|
||||
print(f"\n📋 GUIDANCE")
|
||||
print("=" * 30)
|
||||
if nmse < 1e-3:
|
||||
print("✅ EXCELLENT: Your GGML conversion is working very well!")
|
||||
print(" The differences are negligible for practical use.")
|
||||
elif nmse < 1e-2:
|
||||
print("👍 GOOD: Your GGML conversion is working well.")
|
||||
print(" Small differences are likely due to precision/quantization.")
|
||||
elif nmse < 0.1:
|
||||
print("⚠️ ACCEPTABLE: Conversion is working but with some differences.")
|
||||
print(" Check if you're using quantization (Q4, Q8, etc.)")
|
||||
print(" Test generation quality to see if it's acceptable.")
|
||||
else:
|
||||
print("❌ PROBLEMATIC: Large differences detected.")
|
||||
print(" Check your conversion process for potential issues.")
|
||||
print(" Verify you're using the same model weights.")
|
||||
|
||||
# NMSE benchmarks
|
||||
print(f"\n📚 NMSE BENCHMARKS")
|
||||
print("=" * 30)
|
||||
print("< 1e-6: Essentially identical")
|
||||
print("< 1e-4: Excellent (typical for good conversions)")
|
||||
print("< 1e-3: Very good")
|
||||
print("< 1e-2: Good (acceptable for most use cases)")
|
||||
print("< 0.1: Acceptable (may need verification)")
|
||||
print("> 1.0: Poor (worse than random)")
|
||||
|
||||
# Exit code based on NMSE
|
||||
if nmse < 1e-2:
|
||||
print(f"\n✅ RESULT: PASS (NMSE = {nmse:.2e})")
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(f"\n❌ RESULT: NEEDS REVIEW (NMSE = {nmse:.2e})")
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
299
examples/model-conversion/scripts/utils/common.py
Normal file
299
examples/model-conversion/scripts/utils/common.py
Normal file
@@ -0,0 +1,299 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import sys
|
||||
import torch
|
||||
import transformers
|
||||
import json
|
||||
import textwrap
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_model_name_from_env_path(env_path_name):
|
||||
model_path = os.getenv(env_path_name)
|
||||
if not model_path:
|
||||
print(f"Error: {env_path_name} environment variable not set")
|
||||
sys.exit(1)
|
||||
|
||||
if not os.path.exists(model_path):
|
||||
print(f"Error: Model file not found: {model_path}")
|
||||
sys.exit(1)
|
||||
|
||||
name = os.path.basename(os.path.normpath(model_path))
|
||||
if name.endswith(".gguf"):
|
||||
name = name[:-5]
|
||||
|
||||
return name
|
||||
|
||||
|
||||
def summarize(tensor: torch.Tensor, name: str, max_seq: int = 3, max_vals: int = 3):
|
||||
"""
|
||||
Print a tensor in llama.cpp debug style.
|
||||
|
||||
Supports:
|
||||
- 2D tensors (seq, hidden)
|
||||
- 3D tensors (batch, seq, hidden)
|
||||
- 4D tensors (batch, seq, heads, dim_per_head) via flattening heads × dim_per_head
|
||||
|
||||
Shows first and last max_vals of each vector per sequence position.
|
||||
"""
|
||||
t = tensor.detach().to(torch.float32).cpu()
|
||||
|
||||
# Determine dimensions
|
||||
if t.ndim == 3:
|
||||
_, s, _ = t.shape
|
||||
elif t.ndim == 2:
|
||||
_, s = 1, t.shape[0]
|
||||
t = t.unsqueeze(0)
|
||||
elif t.ndim == 4:
|
||||
_, s, _, _ = t.shape
|
||||
else:
|
||||
print(f"Skipping tensor due to unsupported dimensions: {t.ndim}")
|
||||
return
|
||||
|
||||
ten_shape = t.shape
|
||||
|
||||
print(f"ggml_debug: {name} = (f32) ... = {{{ten_shape}}}")
|
||||
print(" [")
|
||||
print(" [")
|
||||
|
||||
# Determine indices for first and last sequences
|
||||
first_indices = list(range(min(s, max_seq)))
|
||||
last_indices = list(range(max(0, s - max_seq), s))
|
||||
|
||||
# Check if there's an overlap between first and last indices or if we're at the edge case of s = 2 * max_seq
|
||||
has_overlap = bool(set(first_indices) & set(last_indices)) or (max_seq * 2 == s)
|
||||
|
||||
# Combine indices
|
||||
if has_overlap:
|
||||
# If there's overlap, just use the combined unique indices
|
||||
indices = sorted(list(set(first_indices + last_indices)))
|
||||
separator_index = None
|
||||
else:
|
||||
# If no overlap, we'll add a separator between first and last sequences
|
||||
indices = first_indices + last_indices
|
||||
separator_index = len(first_indices)
|
||||
|
||||
for i, si in enumerate(indices):
|
||||
# Add separator if needed
|
||||
if separator_index is not None and i == separator_index:
|
||||
print(" ...")
|
||||
|
||||
# Extract appropriate slice
|
||||
vec = t[0, si]
|
||||
if vec.ndim == 2: # 4D case: flatten heads × dim_per_head
|
||||
flat = vec.flatten().tolist()
|
||||
else: # 2D or 3D case
|
||||
flat = vec.tolist()
|
||||
|
||||
# First and last slices
|
||||
first = flat[:max_vals]
|
||||
last = flat[-max_vals:] if len(flat) >= max_vals else flat
|
||||
first_str = ", ".join(f"{v:12.4f}" for v in first)
|
||||
last_str = ", ".join(f"{v:12.4f}" for v in last)
|
||||
|
||||
print(f" [{first_str}, ..., {last_str}]")
|
||||
|
||||
print(" ],")
|
||||
print(" ]")
|
||||
print(f" sum = {t.sum().item():.6f}\n")
|
||||
|
||||
|
||||
def debug_hook(name):
|
||||
def fn(_m, input, output):
|
||||
if isinstance(input, torch.Tensor):
|
||||
summarize(input, name + "_in")
|
||||
elif isinstance(input, (tuple, list)) and len(input) > 0 and isinstance(input[0], torch.Tensor):
|
||||
summarize(input[0], name + "_in")
|
||||
if isinstance(output, torch.Tensor):
|
||||
summarize(output, name + "_out")
|
||||
elif isinstance(output, (tuple, list)) and len(output) > 0 and isinstance(output[0], torch.Tensor):
|
||||
summarize(output[0], name + "_out")
|
||||
|
||||
return fn
|
||||
|
||||
|
||||
def setup_rope_debug(model_module_path: str, function_name: str = "apply_rotary_pos_emb"):
|
||||
"""
|
||||
Apply monkey patch to dump RoPE activations for debugging.
|
||||
|
||||
Args:
|
||||
model_module_path: Path to the model module (e.g., "transformers.models.apertus.modeling_apertus")
|
||||
function_name: Name of the RoPE function to patch (default: "apply_rotary_pos_emb")
|
||||
|
||||
Example:
|
||||
from utils.common import setup_rope_debug
|
||||
setup_rope_debug("transformers.models.apertus.modeling_apertus")
|
||||
"""
|
||||
import importlib
|
||||
|
||||
# Import the module and get the original function
|
||||
module = importlib.import_module(model_module_path)
|
||||
orig_rope = getattr(module, function_name)
|
||||
|
||||
# Set torch print options for better debugging
|
||||
torch.set_printoptions(threshold=float('inf'))
|
||||
torch.set_printoptions(precision=6, sci_mode=False)
|
||||
|
||||
def debug_rope(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
|
||||
# log inputs
|
||||
summarize(q, "RoPE.q_in")
|
||||
summarize(k, "RoPE.k_in")
|
||||
|
||||
# call original
|
||||
q_out, k_out = orig_rope(q, k, cos, sin, position_ids, unsqueeze_dim)
|
||||
|
||||
# log outputs
|
||||
summarize(q_out, "RoPE.q_out")
|
||||
summarize(k_out, "RoPE.k_out")
|
||||
|
||||
return q_out, k_out
|
||||
|
||||
# Patch it
|
||||
setattr(module, function_name, debug_rope)
|
||||
print(f"RoPE debug patching applied to {model_module_path}.{function_name}")
|
||||
|
||||
|
||||
def save_output_data(data, tokens, prompt, model_name, type_suffix="", output_dir="data"):
|
||||
"""
|
||||
Save output data (logits/embeddings), tokens, and prompt to files.
|
||||
|
||||
Args:
|
||||
data: numpy array of floats (logits or embeddings)
|
||||
tokens: list or array of token IDs
|
||||
prompt: string containing the input prompt
|
||||
model_name: name of the model
|
||||
type_suffix: optional suffix like "-embeddings" (default: "")
|
||||
output_dir: directory to save files (default: "data")
|
||||
|
||||
Creates the following files in output_dir:
|
||||
- pytorch-{model_name}{type_suffix}.bin
|
||||
- pytorch-{model_name}{type_suffix}.txt
|
||||
- pytorch-{model_name}{type_suffix}-prompt.txt
|
||||
- pytorch-{model_name}{type_suffix}-tokens.bin
|
||||
"""
|
||||
data_dir = Path(output_dir)
|
||||
data_dir.mkdir(exist_ok=True)
|
||||
base_path = data_dir / f"pytorch-{model_name}{type_suffix}"
|
||||
|
||||
# Convert and flatten logits/embeddings
|
||||
data = data.cpu().numpy() if isinstance(data, torch.Tensor) else np.asarray(data)
|
||||
data = data.flatten() if data.ndim > 1 else data
|
||||
|
||||
# Save logits/embedding files
|
||||
data.astype(np.float32).tofile(f"{base_path}.bin")
|
||||
print(f"Data saved to {base_path}.bin")
|
||||
|
||||
with open(f"{base_path}.txt", "w") as f:
|
||||
f.writelines(f"{i}: {value:.6f}\n" for i, value in enumerate(data))
|
||||
print(f"Data saved to {base_path}.txt")
|
||||
|
||||
# Convert and flatten tokens
|
||||
tokens = tokens.cpu().numpy() if isinstance(tokens, torch.Tensor) else np.asarray(tokens)
|
||||
tokens = tokens.flatten() if tokens.ndim > 1 else tokens
|
||||
|
||||
# Save token binary file
|
||||
tokens.astype(np.int32).tofile(f"{base_path}-tokens.bin")
|
||||
print(f"Tokens saved to {base_path}-tokens.bin")
|
||||
|
||||
# Save prompt file
|
||||
with open(f"{base_path}-prompt.txt", "w") as f:
|
||||
f.write(f"prompt: {prompt}\n")
|
||||
f.write(f"n_tokens: {len(tokens)}\n")
|
||||
f.write(f"token ids: {', '.join(str(int(tid)) for tid in tokens)}\n")
|
||||
print(f"Prompt saved to {base_path}-prompt.txt")
|
||||
|
||||
|
||||
def compare_tokens(original, converted, type_suffix="", output_dir="data"):
|
||||
data_dir = Path(output_dir)
|
||||
|
||||
# Read tokens from both models
|
||||
tokens1_file = data_dir / f"{original}{type_suffix}-tokens.bin"
|
||||
tokens2_file = data_dir / f"{converted}{type_suffix}-tokens.bin"
|
||||
|
||||
if not tokens1_file.exists():
|
||||
print(f"Error: Token file not found: {tokens1_file}")
|
||||
return False
|
||||
|
||||
if not tokens2_file.exists():
|
||||
print(f"Error: Token file not found: {tokens2_file}")
|
||||
return False
|
||||
|
||||
tokens1 = np.fromfile(tokens1_file, dtype=np.int32)
|
||||
tokens2 = np.fromfile(tokens2_file, dtype=np.int32)
|
||||
|
||||
print(f"\nComparing tokens between:")
|
||||
print(f" Original : {original} ({len(tokens1)} tokens)")
|
||||
print(f" Converted: {converted} ({len(tokens2)} tokens)")
|
||||
|
||||
if len(tokens1) != len(tokens2):
|
||||
print(f"\n❌ Token count mismatch: {len(tokens1)} vs {len(tokens2)}")
|
||||
return False
|
||||
|
||||
if np.array_equal(tokens1, tokens2):
|
||||
print(f"\n✅ All {len(tokens1)} tokens match!")
|
||||
return True
|
||||
|
||||
mismatches = np.where(tokens1 != tokens2)[0]
|
||||
print(f"\n❌ Found {len(mismatches)} mismatched tokens:")
|
||||
|
||||
num_to_show = min(len(mismatches), 10)
|
||||
for idx in mismatches[:num_to_show]:
|
||||
print(f" Position {idx}: {tokens1[idx]} vs {tokens2[idx]}")
|
||||
|
||||
if len(mismatches) > num_to_show:
|
||||
print(f" ... and {len(mismatches) - num_to_show} more mismatches")
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def show_version_warning(current_version, model_version):
|
||||
if not model_version:
|
||||
return False
|
||||
|
||||
try:
|
||||
from packaging.version import parse, InvalidVersion
|
||||
try:
|
||||
return parse(current_version) < parse(model_version)
|
||||
except InvalidVersion:
|
||||
return current_version != model_version
|
||||
except ImportError:
|
||||
return current_version != model_version
|
||||
|
||||
def get_model_transformers_version(model_path):
|
||||
if not model_path:
|
||||
return None
|
||||
|
||||
config_path = Path(model_path) / "config.json"
|
||||
if not config_path.is_file():
|
||||
return None
|
||||
|
||||
try:
|
||||
with open(config_path, "r", encoding="utf-8") as f:
|
||||
config = json.load(f)
|
||||
return config.get("transformers_version")
|
||||
except (IOError, json.JSONDecodeError) as e:
|
||||
print(f"Warning: Could not read or parse {config_path}: {e}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
def exit_with_warning(message, model_path):
|
||||
print(message)
|
||||
|
||||
if model_path and transformers is not None:
|
||||
model_transformers_version = get_model_transformers_version(model_path)
|
||||
transformers_version = transformers.__version__
|
||||
if show_version_warning(transformers_version, model_transformers_version):
|
||||
warning_message = f"""
|
||||
=====================================================================
|
||||
Verification failure might be due to a transformers version mismatch:
|
||||
|
||||
Current transformers version: {transformers_version}
|
||||
Model's required version : {model_transformers_version}
|
||||
|
||||
Consider installing the version specified by the model's config:
|
||||
pip install transformers=={model_transformers_version}
|
||||
=====================================================================
|
||||
"""
|
||||
print(textwrap.dedent(warning_message))
|
||||
sys.exit(1)
|
||||
76
examples/model-conversion/scripts/utils/compare_tokens.py
Executable file
76
examples/model-conversion/scripts/utils/compare_tokens.py
Executable file
@@ -0,0 +1,76 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from common import compare_tokens # type: ignore
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Compare tokens between two models',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
%(prog)s pytorch-gemma-3-270m-it llamacpp-gemma-3-270m-it-bf16
|
||||
"""
|
||||
)
|
||||
parser.add_argument(
|
||||
'original',
|
||||
help='Original model name'
|
||||
)
|
||||
parser.add_argument(
|
||||
'converted',
|
||||
help='Converted model name'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-s', '--suffix',
|
||||
default='',
|
||||
help='Type suffix (e.g., "-embeddings")'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-d', '--data-dir',
|
||||
default='data',
|
||||
help='Directory containing token files (default: data)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-v', '--verbose',
|
||||
action='store_true',
|
||||
help='Print prompts from both models'
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_arguments()
|
||||
|
||||
if args.verbose:
|
||||
from pathlib import Path
|
||||
data_dir = Path(args.data_dir)
|
||||
|
||||
prompt1_file = data_dir / f"{args.original}{args.suffix}-prompt.txt"
|
||||
prompt2_file = data_dir / f"{args.converted}{args.suffix}-prompt.txt"
|
||||
|
||||
if prompt1_file.exists():
|
||||
print(f"\nOriginal model prompt ({args.original}):")
|
||||
print(f" {prompt1_file.read_text().strip()}")
|
||||
|
||||
if prompt2_file.exists():
|
||||
print(f"\nConverted model prompt ({args.converted}):")
|
||||
print(f" {prompt2_file.read_text().strip()}")
|
||||
|
||||
print()
|
||||
|
||||
result = compare_tokens(
|
||||
args.original,
|
||||
args.converted,
|
||||
type_suffix=args.suffix,
|
||||
output_dir=args.data_dir
|
||||
)
|
||||
|
||||
# Enable the script to be used in shell scripts so that they can check
|
||||
# the exit code for success/failure.
|
||||
sys.exit(0 if result else 1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,8 @@
|
||||
|
||||
#!/usr/bin/env bash
|
||||
|
||||
COLLECTION_SLUG=$(python ./create_collection.py --return-slug)
|
||||
echo "Created collection: $COLLECTION_SLUG"
|
||||
|
||||
# Use it in the next command
|
||||
python add_model_to_collection.py "$COLLECTION_SLUG" "username/my-model"
|
||||
6
examples/model-conversion/scripts/utils/curl-embedding-server.sh
Executable file
6
examples/model-conversion/scripts/utils/curl-embedding-server.sh
Executable file
@@ -0,0 +1,6 @@
|
||||
#!/usr/bin/env bash
|
||||
curl --request POST \
|
||||
--url http://localhost:8080/embedding \
|
||||
--header "Content-Type: application/json" \
|
||||
--data '{"input": "Hello world today"}' \
|
||||
--silent
|
||||
80
examples/model-conversion/scripts/utils/hf-add-model-to-collection.py
Executable file
80
examples/model-conversion/scripts/utils/hf-add-model-to-collection.py
Executable file
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from huggingface_hub import HfApi
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
def add_model_to_collection(collection_slug, model_id, note=""):
|
||||
"""
|
||||
Add a model to an existing collection
|
||||
|
||||
Args:
|
||||
collection_slug: The slug of the collection (e.g., "username/collection-name-12345")
|
||||
model_id: The model repository ID (e.g., "username/model-name")
|
||||
note: Optional note about the model
|
||||
|
||||
Returns:
|
||||
True if successful, False if failed
|
||||
"""
|
||||
|
||||
# Initialize API
|
||||
api = HfApi()
|
||||
|
||||
try:
|
||||
user_info = api.whoami()
|
||||
print(f"✅ Authenticated as: {user_info['name']}")
|
||||
|
||||
# Verify the model exists
|
||||
print(f"🔍 Checking if model exists: {model_id}")
|
||||
try:
|
||||
model_info = api.model_info(model_id)
|
||||
except Exception as e:
|
||||
print(f"❌ Model not found or not accessible: {model_id}")
|
||||
print(f"Error: {e}")
|
||||
return False
|
||||
|
||||
print(f"📚 Adding model to collection...")
|
||||
api.add_collection_item(
|
||||
collection_slug=collection_slug,
|
||||
item_id=model_id,
|
||||
item_type="model",
|
||||
note=note
|
||||
)
|
||||
|
||||
print(f"✅ Model added to collection successfully!")
|
||||
print(f"🔗 Collection URL: https://huggingface.co/collections/{collection_slug}")
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error adding model to collection: {e}")
|
||||
return False
|
||||
|
||||
def main():
|
||||
# This script requires that the environment variable HF_TOKEN is set with your
|
||||
# Hugging Face API token.
|
||||
api = HfApi()
|
||||
|
||||
parser = argparse.ArgumentParser(description='Add model to a Huggingface Collection')
|
||||
parser.add_argument('--collection', '-c', help='The collection slug username/collection-hash', required=True)
|
||||
parser.add_argument('--model', '-m', help='The model to add to the Collection', required=True)
|
||||
parser.add_argument('--note', '-n', help='An optional note/description', required=False)
|
||||
args = parser.parse_args()
|
||||
|
||||
collection = args.collection
|
||||
model = args.model
|
||||
note = args.note
|
||||
|
||||
success = add_model_to_collection(
|
||||
collection_slug=collection,
|
||||
model_id=model,
|
||||
note=note
|
||||
)
|
||||
|
||||
if success:
|
||||
print("\n🎉 Model added successfully!")
|
||||
else:
|
||||
print("\n❌ Failed to add model to collection")
|
||||
sys.exit(1)
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
106
examples/model-conversion/scripts/utils/hf-create-collection.py
Executable file
106
examples/model-conversion/scripts/utils/hf-create-collection.py
Executable file
@@ -0,0 +1,106 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from huggingface_hub import HfApi
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def create_collection(title, description, private=False, namespace=None, return_slug=False):
|
||||
"""
|
||||
Create a new collection on Hugging Face
|
||||
|
||||
Args:
|
||||
title: Collection title
|
||||
description: Collection description
|
||||
private: Whether the collection should be private (default: False)
|
||||
namespace: Optional namespace (defaults to your username)
|
||||
|
||||
Returns:
|
||||
Collection object if successful, None if failed
|
||||
"""
|
||||
|
||||
# Check if HF_TOKEN is available
|
||||
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
|
||||
if not token:
|
||||
print("❌ No HF_TOKEN or HUGGINGFACE_HUB_TOKEN found in environment variables")
|
||||
print("Please set your Hugging Face token as an environment variable")
|
||||
return None
|
||||
|
||||
# Initialize API
|
||||
api = HfApi()
|
||||
|
||||
try:
|
||||
# Test authentication first
|
||||
user_info = api.whoami()
|
||||
if not return_slug:
|
||||
print(f"✅ Authenticated as: {user_info['name']}")
|
||||
|
||||
# Create the collection
|
||||
if not return_slug:
|
||||
print(f"📚 Creating collection: '{title}'...")
|
||||
collection = api.create_collection(
|
||||
title=title,
|
||||
description=description,
|
||||
private=private,
|
||||
namespace=namespace
|
||||
)
|
||||
|
||||
if not return_slug:
|
||||
print(f"✅ Collection created successfully!")
|
||||
print(f"📋 Collection slug: {collection.slug}")
|
||||
print(f"🔗 Collection URL: https://huggingface.co/collections/{collection.slug}")
|
||||
|
||||
return collection
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error creating collection: {e}")
|
||||
return None
|
||||
|
||||
def main():
|
||||
# This script requires that the environment variable HF_TOKEN is set with your
|
||||
# Hugging Face API token.
|
||||
api = HfApi()
|
||||
|
||||
parser = argparse.ArgumentParser(description='Create a Huggingface Collection')
|
||||
parser.add_argument('--name', '-n', help='The name/title of the Collection', required=True)
|
||||
parser.add_argument('--description', '-d', help='The description for the Collection', required=True)
|
||||
parser.add_argument('--namespace', '-ns', help='The namespace to add the Collection to', required=True)
|
||||
parser.add_argument('--private', '-p', help='Create a private Collection', action='store_true') # Fixed
|
||||
parser.add_argument('--return-slug', '-s', help='Only output the collection slug', action='store_true') # Fixed
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
name = args.name
|
||||
description = args.description
|
||||
private = args.private
|
||||
namespace = args.namespace
|
||||
return_slug = args.return_slug
|
||||
|
||||
if not return_slug:
|
||||
print("🚀 Creating Hugging Face Collection")
|
||||
print(f"Title: {name}")
|
||||
print(f"Description: {description}")
|
||||
print(f"Namespace: {namespace}")
|
||||
print(f"Private: {private}")
|
||||
|
||||
collection = create_collection(
|
||||
title=name,
|
||||
description=description,
|
||||
private=private,
|
||||
namespace=namespace,
|
||||
return_slug=return_slug
|
||||
)
|
||||
|
||||
if collection:
|
||||
if return_slug:
|
||||
print(collection.slug)
|
||||
else:
|
||||
print("\n🎉 Collection created successfully!")
|
||||
print(f"Use this slug to add models: {collection.slug}")
|
||||
else:
|
||||
print("\n❌ Failed to create collection")
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
78
examples/model-conversion/scripts/utils/hf-create-model.py
Executable file
78
examples/model-conversion/scripts/utils/hf-create-model.py
Executable file
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from huggingface_hub import HfApi
|
||||
import argparse
|
||||
|
||||
# This script requires that the environment variable HF_TOKEN is set with your
|
||||
# Hugging Face API token.
|
||||
api = HfApi()
|
||||
|
||||
def load_template_and_substitute(template_path, **kwargs):
|
||||
try:
|
||||
with open(template_path, 'r', encoding='utf-8') as f:
|
||||
template_content = f.read()
|
||||
|
||||
return template_content.format(**kwargs)
|
||||
except FileNotFoundError:
|
||||
print(f"Template file '{template_path}' not found!")
|
||||
return None
|
||||
except KeyError as e:
|
||||
print(f"Missing template variable: {e}")
|
||||
return None
|
||||
|
||||
parser = argparse.ArgumentParser(description='Create a new Hugging Face model repository')
|
||||
parser.add_argument('--model-name', '-m', help='Name for the model', required=True)
|
||||
parser.add_argument('--namespace', '-ns', help='Namespace to add the model to', required=True)
|
||||
parser.add_argument('--org-base-model', '-b', help='Original Base model name', default="")
|
||||
parser.add_argument('--no-card', action='store_true', help='Skip creating model card')
|
||||
parser.add_argument('--private', '-p', action='store_true', help='Create private model')
|
||||
parser.add_argument('--embedding', '-e', action='store_true', help='Use embedding model card template')
|
||||
parser.add_argument('--dry-run', '-d', action='store_true', help='Print repository info and template without creating repository')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
repo_id = f"{args.namespace}/{args.model_name}-GGUF"
|
||||
print("Repository ID: ", repo_id)
|
||||
|
||||
repo_url = None
|
||||
if not args.dry_run:
|
||||
repo_url = api.create_repo(
|
||||
repo_id=repo_id,
|
||||
repo_type="model",
|
||||
private=args.private,
|
||||
exist_ok=False
|
||||
)
|
||||
|
||||
if not args.no_card:
|
||||
if args.embedding:
|
||||
template_path = "scripts/embedding/modelcard.template"
|
||||
else:
|
||||
template_path = "scripts/causal/modelcard.template"
|
||||
|
||||
print("Template path: ", template_path)
|
||||
|
||||
model_card_content = load_template_and_substitute(
|
||||
template_path,
|
||||
model_name=args.model_name,
|
||||
namespace=args.namespace,
|
||||
base_model=args.org_base_model,
|
||||
)
|
||||
|
||||
if args.dry_run:
|
||||
print("\nTemplate Content:\n")
|
||||
print(model_card_content)
|
||||
else:
|
||||
if model_card_content:
|
||||
api.upload_file(
|
||||
path_or_fileobj=model_card_content.encode('utf-8'),
|
||||
path_in_repo="README.md",
|
||||
repo_id=repo_id
|
||||
)
|
||||
print("Model card created successfully.")
|
||||
else:
|
||||
print("Failed to create model card.")
|
||||
|
||||
if not args.dry_run and repo_url:
|
||||
print(f"Repository created: {repo_url}")
|
||||
|
||||
|
||||
58
examples/model-conversion/scripts/utils/hf-upload-gguf-model.py
Executable file
58
examples/model-conversion/scripts/utils/hf-upload-gguf-model.py
Executable file
@@ -0,0 +1,58 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from huggingface_hub import HfApi
|
||||
import argparse
|
||||
import os
|
||||
|
||||
def upload_gguf_file(local_file_path, repo_id, filename_in_repo=None):
|
||||
"""
|
||||
Upload a GGUF file to a Hugging Face model repository
|
||||
|
||||
Args:
|
||||
local_file_path: Path to your local GGUF file
|
||||
repo_id: Your repository ID (e.g., "username/model-name")
|
||||
filename_in_repo: Optional custom name for the file in the repo
|
||||
"""
|
||||
|
||||
if not os.path.exists(local_file_path):
|
||||
print(f"❌ File not found: {local_file_path}")
|
||||
return False
|
||||
|
||||
if filename_in_repo is None:
|
||||
filename_in_repo = os.path.basename(local_file_path)
|
||||
|
||||
if filename_in_repo is None or filename_in_repo == "":
|
||||
filename_in_repo = os.path.basename(local_file_path)
|
||||
|
||||
print(f"📤 Uploading {local_file_path} to {repo_id}/{filename_in_repo}")
|
||||
|
||||
api = HfApi()
|
||||
|
||||
try:
|
||||
api.upload_file(
|
||||
path_or_fileobj=local_file_path,
|
||||
path_in_repo=filename_in_repo,
|
||||
repo_id=repo_id,
|
||||
repo_type="model",
|
||||
commit_message=f"Upload {filename_in_repo}"
|
||||
)
|
||||
|
||||
print("✅ Upload successful!")
|
||||
print(f"🔗 File available at: https://huggingface.co/{repo_id}/blob/main/{filename_in_repo}")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Upload failed: {e}")
|
||||
return False
|
||||
|
||||
# This script requires that the environment variable HF_TOKEN is set with your
|
||||
# Hugging Face API token.
|
||||
api = HfApi()
|
||||
|
||||
parser = argparse.ArgumentParser(description='Upload a GGUF model to a Huggingface model repository')
|
||||
parser.add_argument('--gguf-model-path', '-m', help='The GGUF model file to upload', required=True)
|
||||
parser.add_argument('--repo-id', '-r', help='The repository to upload to', required=True)
|
||||
parser.add_argument('--name', '-o', help='The name in the model repository', required=False)
|
||||
args = parser.parse_args()
|
||||
|
||||
upload_gguf_file(args.gguf_model_path, args.repo_id, args.name)
|
||||
14
examples/model-conversion/scripts/utils/inspect-converted-model.sh
Executable file
14
examples/model-conversion/scripts/utils/inspect-converted-model.sh
Executable file
@@ -0,0 +1,14 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# First try command line argument, then environment variable, then file
|
||||
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
|
||||
|
||||
# Final check if we have a model path
|
||||
if [ -z "$CONVERTED_MODEL" ]; then
|
||||
echo "Error: Model path must be provided either as:" >&2
|
||||
echo " 1. Command line argument" >&2
|
||||
echo " 2. CONVERTED_MODEL environment variable" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
../../gguf-py/gguf/scripts/gguf_dump.py $CONVERTED_MODEL
|
||||
67
examples/model-conversion/scripts/utils/inspect-org-model.py
Executable file
67
examples/model-conversion/scripts/utils/inspect-org-model.py
Executable file
@@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import json
|
||||
from safetensors import safe_open
|
||||
from collections import defaultdict
|
||||
|
||||
parser = argparse.ArgumentParser(description='Process model with specified path')
|
||||
parser.add_argument('--model-path', '-m', help='Path to the model')
|
||||
args = parser.parse_args()
|
||||
|
||||
model_path = os.environ.get('MODEL_PATH', args.model_path)
|
||||
if model_path is None:
|
||||
parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
|
||||
|
||||
# Check if there's an index file (multi-file model)
|
||||
index_path = os.path.join(model_path, "model.safetensors.index.json")
|
||||
single_file_path = os.path.join(model_path, "model.safetensors")
|
||||
|
||||
if os.path.exists(index_path):
|
||||
# Multi-file model
|
||||
print("Multi-file model detected")
|
||||
|
||||
with open(index_path, 'r') as f:
|
||||
index_data = json.load(f)
|
||||
|
||||
# Get the weight map (tensor_name -> file_name)
|
||||
weight_map = index_data.get("weight_map", {})
|
||||
|
||||
# Group tensors by file for efficient processing
|
||||
file_tensors = defaultdict(list)
|
||||
for tensor_name, file_name in weight_map.items():
|
||||
file_tensors[file_name].append(tensor_name)
|
||||
|
||||
print("Tensors in model:")
|
||||
|
||||
# Process each shard file
|
||||
for file_name, tensor_names in file_tensors.items():
|
||||
file_path = os.path.join(model_path, file_name)
|
||||
print(f"\n--- From {file_name} ---")
|
||||
|
||||
with safe_open(file_path, framework="pt") as f:
|
||||
for tensor_name in sorted(tensor_names):
|
||||
tensor = f.get_tensor(tensor_name)
|
||||
print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")
|
||||
|
||||
elif os.path.exists(single_file_path):
|
||||
# Single file model (original behavior)
|
||||
print("Single-file model detected")
|
||||
|
||||
with safe_open(single_file_path, framework="pt") as f:
|
||||
keys = f.keys()
|
||||
print("Tensors in model:")
|
||||
for key in sorted(keys):
|
||||
tensor = f.get_tensor(key)
|
||||
print(f"- {key} : shape = {tensor.shape}, dtype = {tensor.dtype}")
|
||||
|
||||
else:
|
||||
print(f"Error: Neither 'model.safetensors.index.json' nor 'model.safetensors' found in {model_path}")
|
||||
print("Available files:")
|
||||
if os.path.exists(model_path):
|
||||
for item in sorted(os.listdir(model_path)):
|
||||
print(f" {item}")
|
||||
else:
|
||||
print(f" Directory {model_path} does not exist")
|
||||
exit(1)
|
||||
35
examples/model-conversion/scripts/utils/perplexity-gen.sh
Executable file
35
examples/model-conversion/scripts/utils/perplexity-gen.sh
Executable file
@@ -0,0 +1,35 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
|
||||
|
||||
# Final check if we have a model path
|
||||
if [ -z "$CONVERTED_MODEL" ]; then
|
||||
echo "Error: Model path must be provided either as:" >&2
|
||||
echo " 1. Command line argument" >&2
|
||||
echo " 2. CONVERTED_MODEL environment variable" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if data/wikitext-2-raw directory exists
|
||||
if [ ! -d "ppl/wikitext-2-raw" ]; then
|
||||
echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2
|
||||
mkdir -p ppl
|
||||
pushd ppl
|
||||
./../../../scripts/get-wikitext-2.sh
|
||||
popd
|
||||
fi
|
||||
|
||||
mkdir -p ppl
|
||||
OUTPUTFILE="ppl/$(basename $CONVERTED_MODEL).kld"
|
||||
echo "Model: $CONVERTED_MODEL"
|
||||
|
||||
cmake --build ../../build --target llama-perplexity -j8
|
||||
|
||||
../.././build/bin/llama-perplexity -m $CONVERTED_MODEL \
|
||||
-f ppl/wikitext-2-raw/wiki.test.raw \
|
||||
--kl-divergence-base $OUTPUTFILE
|
||||
|
||||
echo "Generated logits in $OUTPUTFILE"
|
||||
|
||||
27
examples/model-conversion/scripts/utils/perplexity-run-simple.sh
Executable file
27
examples/model-conversion/scripts/utils/perplexity-run-simple.sh
Executable file
@@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}"
|
||||
|
||||
if [ -z "$QUANTIZED_MODEL" ]; then
|
||||
echo "Error: Model path must be provided either as:" >&2
|
||||
echo " 1. Command line argument" >&2
|
||||
echo " 2. QUANTIZED_MODEL environment variable" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if data/wikitext-2-raw directory exists
|
||||
if [ ! -d "ppl/wikitext-2-raw" ]; then
|
||||
echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2
|
||||
mkdir -p ppl
|
||||
pushd ppl
|
||||
./../../../scripts/get-wikitext-2.sh
|
||||
popd
|
||||
fi
|
||||
|
||||
cmake --build ../../build --target llama-perplexity -j8
|
||||
|
||||
../.././build/bin/llama-perplexity -m $QUANTIZED_MODEL -f ppl/wikitext-2-raw/wiki.test.raw
|
||||
|
||||
|
||||
28
examples/model-conversion/scripts/utils/perplexity-run.sh
Executable file
28
examples/model-conversion/scripts/utils/perplexity-run.sh
Executable file
@@ -0,0 +1,28 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}"
|
||||
LOGITS_FILE="${1:-"$LOGITS_FILE"}"
|
||||
|
||||
if [ -z "$QUANTIZED_MODEL" ]; then
|
||||
echo "Error: Model path must be provided either as:" >&2
|
||||
echo " 1. Command line argument" >&2
|
||||
echo " 2. QUANTIZED_MODEL environment variable" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -f ${LOGITS_FILE} ]; then
|
||||
echo "Error: logits file '${LOGITS_FILE} was not found"
|
||||
echo "Did you run the perplexity-gen.sh script?"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Model: $QUANTIZED_MODEL"
|
||||
echo "Data file: $LOGITS_FILE"
|
||||
|
||||
cmake --build ../../build --target llama-perplexity -j8
|
||||
|
||||
../.././build/bin/llama-perplexity -m $QUANTIZED_MODEL \
|
||||
--kl-divergence-base $LOGITS_FILE \
|
||||
--kl-divergence
|
||||
48
examples/model-conversion/scripts/utils/quantize.sh
Executable file
48
examples/model-conversion/scripts/utils/quantize.sh
Executable file
@@ -0,0 +1,48 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
|
||||
QUANTIZED_TYPE="${2:-"$QUANTIZED_TYPE"}"
|
||||
TOKEN_EMBD_TYPE="${3:-"${TOKEN_EMBD_TYPE}"}"
|
||||
OUTPUT_TYPE="${4:-"${OUTPUT_TYPE}"}"
|
||||
QUANTIZED_MODEL=$CONVERTED_MODEL
|
||||
|
||||
# Final check if we have a model path
|
||||
if [ -z "$CONVERTED_MODEL" ]; then
|
||||
echo "Error: Model path must be provided either as:" >&2
|
||||
echo " 1. Command line argument" >&2
|
||||
echo " 2. CONVERTED_MODEL environment variable" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -z "$QUANTIZED_TYPE" ]; then
|
||||
echo "Error: QUANTIZED_TYPE is required" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo $CONVERTED_MODEL
|
||||
|
||||
# Process the quantized model filename
|
||||
if [[ "$QUANTIZED_MODEL" == *.gguf ]]; then
|
||||
# Remove .gguf suffix, add quantized type, then add .gguf back
|
||||
BASE_NAME="${QUANTIZED_MODEL%.gguf}"
|
||||
QUANTIZED_MODEL="${BASE_NAME}-${QUANTIZED_TYPE}.gguf"
|
||||
else
|
||||
echo "Error: QUANTIZED_MODEL must end with .gguf extension" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cmake --build ../../build --target llama-quantize -j8
|
||||
|
||||
echo $TOKEN_EMBD_TYPE
|
||||
echo $OUTPUT_TYPE
|
||||
|
||||
CMD_ARGS=("../../build/bin/llama-quantize")
|
||||
[[ -n "$TOKEN_EMBD_TYPE" ]] && CMD_ARGS+=("--token-embedding-type" "$TOKEN_EMBD_TYPE")
|
||||
[[ -n "$OUTPUT_TYPE" ]] && CMD_ARGS+=("--output-tensor-type" "$OUTPUT_TYPE")
|
||||
CMD_ARGS+=("$CONVERTED_MODEL" "$QUANTIZED_MODEL" "$QUANTIZED_TYPE")
|
||||
|
||||
"${CMD_ARGS[@]}"
|
||||
|
||||
echo "Quantized model saved to: $QUANTIZED_MODEL"
|
||||
22
examples/model-conversion/scripts/utils/run-embedding-server.sh
Executable file
22
examples/model-conversion/scripts/utils/run-embedding-server.sh
Executable file
@@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
#
|
||||
# First try command line argument, then environment variable, then file
|
||||
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
|
||||
|
||||
# Final check if we have a model path
|
||||
if [ -z "$CONVERTED_MODEL" ]; then
|
||||
echo "Error: Model path must be provided either as:" >&2
|
||||
echo " 1. Command line argument" >&2
|
||||
echo " 2. CONVERTED_MODEL environment variable" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo $CONVERTED_MODEL
|
||||
|
||||
cmake --build ../../build --target llama-server
|
||||
|
||||
../../build/bin/llama-server -m $CONVERTED_MODEL \
|
||||
--embedding \
|
||||
--pooling none
|
||||
242
examples/model-conversion/scripts/utils/semantic_check.py
Normal file
242
examples/model-conversion/scripts/utils/semantic_check.py
Normal file
@@ -0,0 +1,242 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import numpy as np
|
||||
import argparse
|
||||
import os
|
||||
import importlib
|
||||
from pathlib import Path
|
||||
|
||||
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
|
||||
from common import compare_tokens, exit_with_warning # type: ignore[import-not-found]
|
||||
|
||||
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
|
||||
|
||||
def cosine_similarity(a, b=None):
|
||||
a = np.asarray(a)
|
||||
if b is None:
|
||||
b = a
|
||||
else:
|
||||
b = np.asarray(b)
|
||||
|
||||
if a.ndim == 1:
|
||||
a = a.reshape(1, -1)
|
||||
if b.ndim == 1:
|
||||
b = b.reshape(1, -1)
|
||||
|
||||
a_norms = np.linalg.norm(a, axis=1, keepdims=True)
|
||||
b_norms = np.linalg.norm(b, axis=1, keepdims=True)
|
||||
|
||||
a_norms = np.where(a_norms == 0, 1e-8, a_norms)
|
||||
b_norms = np.where(b_norms == 0, 1e-8, b_norms)
|
||||
|
||||
a_normalized = a / a_norms
|
||||
b_normalized = b / b_norms
|
||||
|
||||
# Compute cosine similarity
|
||||
return np.dot(a_normalized, b_normalized.T)
|
||||
|
||||
def load_embeddings_from_file(filename, n_tokens, n_embd):
|
||||
embeddings = np.fromfile(filename, dtype=np.float32)
|
||||
# Check if this is pooled (single embedding) or per-token embeddings
|
||||
if len(embeddings) == n_embd:
|
||||
return embeddings.reshape(1, n_embd)
|
||||
else:
|
||||
return embeddings.reshape(n_tokens, n_embd)
|
||||
|
||||
def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
|
||||
np.set_printoptions(suppress=True, precision=6)
|
||||
print("pytorch embeddings:");
|
||||
print(python_emb)
|
||||
print("llama.cpp embeddings:");
|
||||
print(cpp_emb)
|
||||
print(f"\n=== Prompt: '{prompt}' ===")
|
||||
print(f"Tokens: {tokens}")
|
||||
print(f"Embeddings shape: Python {python_emb.shape}, llama.cpp {cpp_emb.shape}")
|
||||
|
||||
n_tokens = len(tokens)
|
||||
is_pooled = python_emb.shape[0] == 1
|
||||
|
||||
if is_pooled:
|
||||
print(f"\n[Pooled Embeddings Mode - comparing single sentence embeddings]")
|
||||
|
||||
# 1. Direct embedding comparison for pooled embeddings
|
||||
print(f"\n1. Raw Embedding Magnitude Comparison:")
|
||||
py_mag = np.linalg.norm(python_emb[0])
|
||||
cpp_mag = np.linalg.norm(cpp_emb[0])
|
||||
ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
|
||||
print(f" Pooled embedding: Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")
|
||||
|
||||
# 2. Cross-model similarity for pooled embeddings
|
||||
print(f"\n2. Cross-Model Pooled Embedding Similarity:")
|
||||
sim = cosine_similarity([python_emb[0]], [cpp_emb[0]])[0][0]
|
||||
print(f" Cosine similarity: {sim:.6f}")
|
||||
|
||||
return {
|
||||
'cross_model_similarities': [sim],
|
||||
'similarity_matrix_diff': np.array([[0.0]]),
|
||||
'max_diff': 0.0,
|
||||
'mean_diff': 0.0,
|
||||
'rms_diff': 0.0
|
||||
}
|
||||
else:
|
||||
# Original per-token comparison logic
|
||||
# 1. Direct embedding comparison
|
||||
print(f"\n1. Raw Embedding Magnitude Comparison:")
|
||||
# Check if the distance of each token embedding from the origin and compare
|
||||
# if the vectors are on the same "sphere". This does not tell us about
|
||||
# direction (meaning of the token embedding), just magnitude.
|
||||
for i in range(n_tokens):
|
||||
py_mag = np.linalg.norm(python_emb[i]) # calculate standard euclidean norm for Python embeddings
|
||||
cpp_mag = np.linalg.norm(cpp_emb[i]) # calculate standard euclidean norm for llama.cpp embeddings
|
||||
ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
|
||||
print(f" Token {i} ({tokens[i]}): Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")
|
||||
|
||||
# 2. Cosine similarity between tokens within each model
|
||||
# Here we check the direction of token embeddings to see if the have the
|
||||
# same meaning (similarity). This is done by calculating cosine similarity
|
||||
# of a pair of token embeddings within each model.
|
||||
print(f"\n2. Within-Model Token Similarities:")
|
||||
print(" Python model:")
|
||||
for i in range(n_tokens):
|
||||
for j in range(i+1, n_tokens):
|
||||
sim = cosine_similarity([python_emb[i]], [python_emb[j]])[0][0]
|
||||
print(f" {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")
|
||||
|
||||
print(" llama.cpp model:")
|
||||
for i in range(n_tokens):
|
||||
for j in range(i+1, n_tokens):
|
||||
sim = cosine_similarity([cpp_emb[i]], [cpp_emb[j]])[0][0]
|
||||
print(f" {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")
|
||||
|
||||
# 3. Cross-model similarity (same token position)
|
||||
print(f"\n3. Cross-Model Same-Token Similarities:")
|
||||
for i in range(n_tokens):
|
||||
sim = cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0]
|
||||
print(f" Token {i} ({tokens[i]}): {sim:.4f}")
|
||||
|
||||
# 4. Similarity matrix comparison
|
||||
print(f"\n4. Similarity Matrix Differences:")
|
||||
py_sim_matrix = cosine_similarity(python_emb)
|
||||
cpp_sim_matrix = cosine_similarity(cpp_emb)
|
||||
diff_matrix = np.abs(py_sim_matrix - cpp_sim_matrix)
|
||||
|
||||
print(f" Max difference: {np.max(diff_matrix):.4f}")
|
||||
print(f" Mean difference: {np.mean(diff_matrix):.4f}")
|
||||
print(f" RMS difference: {np.sqrt(np.mean(diff_matrix**2)):.4f}")
|
||||
|
||||
return {
|
||||
'cross_model_similarities': [cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0] for i in range(n_tokens)],
|
||||
'similarity_matrix_diff': diff_matrix,
|
||||
'max_diff': np.max(diff_matrix),
|
||||
'mean_diff': np.mean(diff_matrix),
|
||||
'rms_diff': np.sqrt(np.mean(diff_matrix**2))
|
||||
}
|
||||
|
||||
def read_prompt_from_file(file_path):
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return f.read().strip()
|
||||
except FileNotFoundError:
|
||||
print(f"Error: Prompts file '{file_path}' not found")
|
||||
exit(1)
|
||||
except Exception as e:
|
||||
print(f"Error reading prompts file: {e}")
|
||||
exit(1)
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings')
|
||||
parser.add_argument('--model-path', '-m', required=True, help='Path to the original Python model')
|
||||
parser.add_argument('--python-embeddings', '-pe', help='Path to pytorch embeddings "logits" binary file')
|
||||
parser.add_argument('--cpp-embeddings', '-ce', help='Path to llama.cpp embeddings "logits" binary file')
|
||||
parser.add_argument('--causal', '-c', default=False, help='if the model is causal (default: false)', action='store_true')
|
||||
parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt')
|
||||
parser.add_argument('--prompts-file', '-pf', help='Path to file containing prompts')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.prompts_file:
|
||||
prompt = read_prompt_from_file(args.prompts_file)
|
||||
else:
|
||||
prompt = args.prompt
|
||||
|
||||
python_emb_path = Path(args.python_embeddings)
|
||||
cpp_emb_path = Path(args.cpp_embeddings)
|
||||
|
||||
# Extract base names (e.g., "pytorch-model-name-embeddings.bin" -> "pytorch-model-name")
|
||||
python_model_name = python_emb_path.stem.replace("-embeddings", "")
|
||||
cpp_model_name = cpp_emb_path.stem.replace("-embeddings", "")
|
||||
|
||||
print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
|
||||
print("=" * 70)
|
||||
|
||||
# First verify tokens match before comparing embeddings
|
||||
print("\n🔍 Token Comparison Check")
|
||||
print("=" * 70)
|
||||
data_dir = python_emb_path.parent
|
||||
if not compare_tokens(python_model_name, cpp_model_name, type_suffix="-embeddings", output_dir=str(data_dir)):
|
||||
exit_with_warning("\n❌ Token mismatch detected", args.model_path)
|
||||
print()
|
||||
|
||||
# Single prompt detailed comparison
|
||||
print(f"\nTesting with prompt: '{prompt}'")
|
||||
|
||||
# Load the python model to get configuration information and also to load the tokenizer.
|
||||
print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
|
||||
config = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True)
|
||||
|
||||
if unreleased_model_name:
|
||||
model_name_lower = unreleased_model_name.lower()
|
||||
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
|
||||
if args.causal:
|
||||
class_name = f"{unreleased_model_name}ForCausalLM"
|
||||
else:
|
||||
class_name = f"{unreleased_model_name}Model"
|
||||
print(f"Model class: {class_name}")
|
||||
print(f"Importing unreleased model module: {unreleased_module_path}")
|
||||
|
||||
try:
|
||||
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
|
||||
model = model_class.from_pretrained(args.model_path)
|
||||
except (ImportError, AttributeError) as e:
|
||||
print(f"Failed to import or load model: {e}")
|
||||
exit(1)
|
||||
else:
|
||||
if args.causal:
|
||||
model = AutoModelForCausalLM.from_pretrained(args.model_path, trust_remote_code=True)
|
||||
else:
|
||||
model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
|
||||
|
||||
encoded = tokenizer(prompt, return_tensors="pt")
|
||||
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
|
||||
n_tokens = len(tokens)
|
||||
print(f"n_tokens: {n_tokens}");
|
||||
print(f"hidden_size: {model.config.hidden_size}")
|
||||
|
||||
# Load binary embeddings from data directory.
|
||||
llamacpp_embeddings = load_embeddings_from_file(args.cpp_embeddings, n_tokens, model.config.hidden_size)
|
||||
python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size)
|
||||
|
||||
# Run comparison
|
||||
results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, prompt)
|
||||
|
||||
# Summary
|
||||
print(f"\n=== SUMMARY ===")
|
||||
avg_cross_sim = np.mean(results['cross_model_similarities'])
|
||||
print(f"Average cross-model similarity: {avg_cross_sim:.4f}")
|
||||
print(f"Similarity matrix RMS difference: {results['rms_diff']:.4f}")
|
||||
|
||||
# Quality assessment
|
||||
if avg_cross_sim > 0.95:
|
||||
print("✅ EXCELLENT: Models are highly similar")
|
||||
elif avg_cross_sim > 0.90:
|
||||
print("✅ VERY GOOD: Models are very similar")
|
||||
elif avg_cross_sim > 0.80:
|
||||
print("⚠️ GOOD: Models are reasonably similar")
|
||||
elif avg_cross_sim > 0.70:
|
||||
print("⚠️ FAIR: Models have some differences")
|
||||
else:
|
||||
exit_with_warning("❌ POOR: Models are significantly different", args.model_path)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user